diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/.hydra/config.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b56b4a44e4a55540c6fdc3e0873f58a409f6e2b8 --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/.hydra/config.yaml @@ -0,0 +1,169 @@ +data: + tokenizer: null + train_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet + train_data_num: null + val_data_num: null + prompt_key: prompt + max_prompt_length: 4096 + max_response_length: 1024 + max_start_length: 256 + max_obs_length: 512 + train_batch_size: 128 + val_batch_size: 64 + return_raw_input_ids: false + return_raw_chat: false + shuffle_train_dataloader: true +actor_rollout_ref: + hybrid_engine: true + model: + path: Qwen/Qwen3-4B-Instruct-2507 + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + use_remove_padding: true + actor: + strategy: fsdp + ppo_mini_batch_size: 64 + ppo_micro_batch_size: 64 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + grad_clip: 1.0 + state_masking: false + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: false + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + ulysses_sequence_parallel_size: 1 + optim: + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + min_lr_ratio: null + warmup_style: constant + total_training_steps: -1 + fsdp_config: + wrap_policy: + min_num_params: 0 + param_offload: true + grad_offload: false + optimizer_offload: true + fsdp_size: -1 + ppo_micro_batch_size_per_gpu: 16 + ref: + fsdp_config: + param_offload: true + wrap_policy: + min_num_params: 0 + fsdp_size: -1 + log_prob_micro_batch_size: 64 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} + rollout: + name: vllm + temperature: 1.0 + top_k: -1 + top_p: 0.95 + prompt_length: ${data.max_prompt_length} + response_length: ${data.max_response_length} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + free_cache_engine: true + load_format: dummy_dtensor + tensor_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + log_prob_micro_batch_size: 64 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + do_sample: true + 'n': 1 + n_agent: 1 +critic: + strategy: fsdp + optim: + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + min_lr_ratio: null + warmup_style: constant + total_training_steps: -1 + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: {} + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: false + use_remove_padding: false + fsdp_config: + param_offload: false + grad_offload: false + optimizer_offload: false + wrap_policy: + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: 64 + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: 
${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 +reward_model: + enable: false + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: false + fsdp_config: + min_num_params: 0 + param_offload: false + micro_batch_size: 64 + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + structure_format_score: 0 + final_format_score: 0 + retrieval_score: 0 +retriever: + url: http://127.0.0.1:8000/retrieve + topk: 3 +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + no_think_rl: false + kl_penalty: kl + kl_ctrl: + type: fixed + kl_coef: 0.001 + state_masking: + start_state_marker: + end_state_marker: +trainer: + total_epochs: 15 + total_training_steps: 1005 + project_name: '' + experiment_name: llm_guard_3B_10k_v2 + logger: + - wandb + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 100 + test_freq: 50 + critic_warmup: 0 + default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name} + default_local_dir: verl_checkpoints/llm_guard_3B_10k_v2 +max_turns: 1 +do_search: false diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/.hydra/hydra.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..20e1de6e9e0c9570b2e880659fb8419389098566 --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/.hydra/hydra.yaml @@ -0,0 +1,189 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet + - data.train_batch_size=128 + - data.val_batch_size=64 + - data.max_prompt_length=4096 + - data.max_response_length=1024 + - data.shuffle_train_dataloader=True + - algorithm.adv_estimator=grpo + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.model.enable_gradient_checkpointing=true + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.actor.ppo_mini_batch_size=64 + - +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.fsdp_config.param_offload=true + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=true + - actor_rollout_ref.rollout.log_prob_micro_batch_size=64 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.ref.log_prob_micro_batch_size=64 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - trainer.logger=[wandb] + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=50 + - trainer.project_name= + - trainer.experiment_name=llm_guard_3B_10k_v2 + - trainer.total_epochs=15 + - trainer.total_training_steps=1005 + - trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2 + - do_search=false + - max_turns=1 + job: + name: main_ppo + chdir: null + override_dirname: 
+actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.fsdp_config.optimizer_offload=true,actor_rollout_ref.actor.fsdp_config.param_offload=true,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_mini_batch_size=64,actor_rollout_ref.model.enable_gradient_checkpointing=true,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,data.max_prompt_length=4096,data.max_response_length=1024,data.shuffle_train_dataloader=True,data.train_batch_size=128,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet,data.val_batch_size=64,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet,do_search=false,max_turns=1,trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2,trainer.experiment_name=llm_guard_3B_10k_v2,trainer.logger=[wandb],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.save_freq=100,trainer.test_freq=50,trainer.total_epochs=15,trainer.total_training_steps=1005 + id: ??? + num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1 + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/.hydra/overrides.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..524aa3462c384f98ce183bddb516e57eb44f10c1 --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/.hydra/overrides.yaml @@ -0,0 +1,35 @@ +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet +- data.train_batch_size=128 +- data.val_batch_size=64 +- data.max_prompt_length=4096 +- data.max_response_length=1024 +- data.shuffle_train_dataloader=True +- algorithm.adv_estimator=grpo +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.model.enable_gradient_checkpointing=true +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.actor.ppo_mini_batch_size=64 +- +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.fsdp_config.param_offload=true +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=true +- 
actor_rollout_ref.rollout.log_prob_micro_batch_size=64 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.ref.log_prob_micro_batch_size=64 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- trainer.logger=[wandb] +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=50 +- trainer.project_name= +- trainer.experiment_name=llm_guard_3B_10k_v2 +- trainer.total_epochs=15 +- trainer.total_training_steps=1005 +- trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2 +- do_search=false +- max_turns=1 diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/main_ppo.log b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-18-07/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/.hydra/config.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b56b4a44e4a55540c6fdc3e0873f58a409f6e2b8 --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/.hydra/config.yaml @@ -0,0 +1,169 @@ +data: + tokenizer: null + train_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet + train_data_num: null + val_data_num: null + prompt_key: prompt + max_prompt_length: 4096 + max_response_length: 1024 + max_start_length: 256 + max_obs_length: 512 + train_batch_size: 128 + val_batch_size: 64 + return_raw_input_ids: false + return_raw_chat: false + shuffle_train_dataloader: true +actor_rollout_ref: + hybrid_engine: true + model: + path: Qwen/Qwen3-4B-Instruct-2507 + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + use_remove_padding: true + actor: + strategy: fsdp + ppo_mini_batch_size: 64 + ppo_micro_batch_size: 64 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + grad_clip: 1.0 + state_masking: false + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: false + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + ulysses_sequence_parallel_size: 1 + optim: + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + min_lr_ratio: null + warmup_style: constant + total_training_steps: -1 + fsdp_config: + wrap_policy: + min_num_params: 0 + param_offload: true + grad_offload: false + optimizer_offload: true + fsdp_size: -1 + ppo_micro_batch_size_per_gpu: 16 + ref: + fsdp_config: + param_offload: true + wrap_policy: + min_num_params: 0 + fsdp_size: -1 + log_prob_micro_batch_size: 64 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} + rollout: + name: vllm + temperature: 1.0 + top_k: -1 + top_p: 0.95 + prompt_length: ${data.max_prompt_length} + response_length: ${data.max_response_length} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + free_cache_engine: true + load_format: dummy_dtensor + tensor_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + log_prob_micro_batch_size: 64 + 
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + do_sample: true + 'n': 1 + n_agent: 1 +critic: + strategy: fsdp + optim: + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + min_lr_ratio: null + warmup_style: constant + total_training_steps: -1 + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: {} + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: false + use_remove_padding: false + fsdp_config: + param_offload: false + grad_offload: false + optimizer_offload: false + wrap_policy: + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: 64 + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 +reward_model: + enable: false + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: false + fsdp_config: + min_num_params: 0 + param_offload: false + micro_batch_size: 64 + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + structure_format_score: 0 + final_format_score: 0 + retrieval_score: 0 +retriever: + url: http://127.0.0.1:8000/retrieve + topk: 3 +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + no_think_rl: false + kl_penalty: kl + kl_ctrl: + type: fixed + kl_coef: 0.001 + state_masking: + start_state_marker: + end_state_marker: +trainer: + total_epochs: 15 + total_training_steps: 1005 + project_name: '' + experiment_name: llm_guard_3B_10k_v2 + logger: + - wandb + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 100 + test_freq: 50 + critic_warmup: 0 + default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name} + default_local_dir: verl_checkpoints/llm_guard_3B_10k_v2 +max_turns: 1 +do_search: false diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/.hydra/hydra.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b957d355a727fca1d251b04fca5845220317331f --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/.hydra/hydra.yaml @@ -0,0 +1,189 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet + - data.train_batch_size=128 + - data.val_batch_size=64 + - data.max_prompt_length=4096 + - data.max_response_length=1024 + - data.shuffle_train_dataloader=True + - algorithm.adv_estimator=grpo + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.model.enable_gradient_checkpointing=true + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.actor.ppo_mini_batch_size=64 + - +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.fsdp_config.param_offload=true + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=true + - actor_rollout_ref.rollout.log_prob_micro_batch_size=64 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.ref.log_prob_micro_batch_size=64 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - trainer.logger=[wandb] + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=50 + - trainer.project_name= + - trainer.experiment_name=llm_guard_3B_10k_v2 + - trainer.total_epochs=15 + - trainer.total_training_steps=1005 + - trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2 + - do_search=false + - max_turns=1 + job: + name: main_ppo + chdir: null + override_dirname: 
+actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.fsdp_config.optimizer_offload=true,actor_rollout_ref.actor.fsdp_config.param_offload=true,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_mini_batch_size=64,actor_rollout_ref.model.enable_gradient_checkpointing=true,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,data.max_prompt_length=4096,data.max_response_length=1024,data.shuffle_train_dataloader=True,data.train_batch_size=128,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet,data.val_batch_size=64,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet,do_search=false,max_turns=1,trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2,trainer.experiment_name=llm_guard_3B_10k_v2,trainer.logger=[wandb],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.save_freq=100,trainer.test_freq=50,trainer.total_epochs=15,trainer.total_training_steps=1005 + id: ??? + num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1 + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/.hydra/overrides.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..524aa3462c384f98ce183bddb516e57eb44f10c1 --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/.hydra/overrides.yaml @@ -0,0 +1,35 @@ +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet +- data.train_batch_size=128 +- data.val_batch_size=64 +- data.max_prompt_length=4096 +- data.max_response_length=1024 +- data.shuffle_train_dataloader=True +- algorithm.adv_estimator=grpo +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.model.enable_gradient_checkpointing=true +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.actor.ppo_mini_batch_size=64 +- +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.fsdp_config.param_offload=true +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=true +- 
actor_rollout_ref.rollout.log_prob_micro_batch_size=64 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.ref.log_prob_micro_batch_size=64 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- trainer.logger=[wandb] +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=50 +- trainer.project_name= +- trainer.experiment_name=llm_guard_3B_10k_v2 +- trainer.total_epochs=15 +- trainer.total_training_steps=1005 +- trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2 +- do_search=false +- max_turns=1 diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/main_ppo.log b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-20-59/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/.hydra/config.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b56b4a44e4a55540c6fdc3e0873f58a409f6e2b8 --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/.hydra/config.yaml @@ -0,0 +1,169 @@ +data: + tokenizer: null + train_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet + train_data_num: null + val_data_num: null + prompt_key: prompt + max_prompt_length: 4096 + max_response_length: 1024 + max_start_length: 256 + max_obs_length: 512 + train_batch_size: 128 + val_batch_size: 64 + return_raw_input_ids: false + return_raw_chat: false + shuffle_train_dataloader: true +actor_rollout_ref: + hybrid_engine: true + model: + path: Qwen/Qwen3-4B-Instruct-2507 + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + use_remove_padding: true + actor: + strategy: fsdp + ppo_mini_batch_size: 64 + ppo_micro_batch_size: 64 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + grad_clip: 1.0 + state_masking: false + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: false + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + ulysses_sequence_parallel_size: 1 + optim: + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + min_lr_ratio: null + warmup_style: constant + total_training_steps: -1 + fsdp_config: + wrap_policy: + min_num_params: 0 + param_offload: true + grad_offload: false + optimizer_offload: true + fsdp_size: -1 + ppo_micro_batch_size_per_gpu: 16 + ref: + fsdp_config: + param_offload: true + wrap_policy: + min_num_params: 0 + fsdp_size: -1 + log_prob_micro_batch_size: 64 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} + rollout: + name: vllm + temperature: 1.0 + top_k: -1 + top_p: 0.95 + prompt_length: ${data.max_prompt_length} + response_length: ${data.max_response_length} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + free_cache_engine: true + load_format: dummy_dtensor + tensor_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + log_prob_micro_batch_size: 64 + 
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + do_sample: true + 'n': 1 + n_agent: 1 +critic: + strategy: fsdp + optim: + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + min_lr_ratio: null + warmup_style: constant + total_training_steps: -1 + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: {} + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: false + use_remove_padding: false + fsdp_config: + param_offload: false + grad_offload: false + optimizer_offload: false + wrap_policy: + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: 64 + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 +reward_model: + enable: false + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: false + fsdp_config: + min_num_params: 0 + param_offload: false + micro_batch_size: 64 + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + structure_format_score: 0 + final_format_score: 0 + retrieval_score: 0 +retriever: + url: http://127.0.0.1:8000/retrieve + topk: 3 +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + no_think_rl: false + kl_penalty: kl + kl_ctrl: + type: fixed + kl_coef: 0.001 + state_masking: + start_state_marker: + end_state_marker: +trainer: + total_epochs: 15 + total_training_steps: 1005 + project_name: '' + experiment_name: llm_guard_3B_10k_v2 + logger: + - wandb + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 100 + test_freq: 50 + critic_warmup: 0 + default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name} + default_local_dir: verl_checkpoints/llm_guard_3B_10k_v2 +max_turns: 1 +do_search: false diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/.hydra/hydra.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c290ffc0bc650dd0d7560f57f3dec106ac85c173 --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/.hydra/hydra.yaml @@ -0,0 +1,189 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet + - data.train_batch_size=128 + - data.val_batch_size=64 + - data.max_prompt_length=4096 + - data.max_response_length=1024 + - data.shuffle_train_dataloader=True + - algorithm.adv_estimator=grpo + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.model.enable_gradient_checkpointing=true + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.actor.ppo_mini_batch_size=64 + - +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.fsdp_config.param_offload=true + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=true + - actor_rollout_ref.rollout.log_prob_micro_batch_size=64 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.ref.log_prob_micro_batch_size=64 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - trainer.logger=[wandb] + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=50 + - trainer.project_name= + - trainer.experiment_name=llm_guard_3B_10k_v2 + - trainer.total_epochs=15 + - trainer.total_training_steps=1005 + - trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2 + - do_search=false + - max_turns=1 + job: + name: main_ppo + chdir: null + override_dirname: 
+actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.fsdp_config.optimizer_offload=true,actor_rollout_ref.actor.fsdp_config.param_offload=true,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_mini_batch_size=64,actor_rollout_ref.model.enable_gradient_checkpointing=true,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,data.max_prompt_length=4096,data.max_response_length=1024,data.shuffle_train_dataloader=True,data.train_batch_size=128,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet,data.val_batch_size=64,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet,do_search=false,max_turns=1,trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2,trainer.experiment_name=llm_guard_3B_10k_v2,trainer.logger=[wandb],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.save_freq=100,trainer.test_freq=50,trainer.total_epochs=15,trainer.total_training_steps=1005 + id: ??? + num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1 + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/.hydra/overrides.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..524aa3462c384f98ce183bddb516e57eb44f10c1 --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/.hydra/overrides.yaml @@ -0,0 +1,35 @@ +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet +- data.train_batch_size=128 +- data.val_batch_size=64 +- data.max_prompt_length=4096 +- data.max_response_length=1024 +- data.shuffle_train_dataloader=True +- algorithm.adv_estimator=grpo +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.model.enable_gradient_checkpointing=true +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.actor.ppo_mini_batch_size=64 +- +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.fsdp_config.param_offload=true +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=true +- 
actor_rollout_ref.rollout.log_prob_micro_batch_size=64 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.ref.log_prob_micro_batch_size=64 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- trainer.logger=[wandb] +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=50 +- trainer.project_name= +- trainer.experiment_name=llm_guard_3B_10k_v2 +- trainer.total_epochs=15 +- trainer.total_training_steps=1005 +- trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2 +- do_search=false +- max_turns=1 diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/main_ppo.log b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-24-15/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/.hydra/config.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1491ed06481f53b0d9b7b8f938bbc797a19ccc7b --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/.hydra/config.yaml @@ -0,0 +1,169 @@ +data: + tokenizer: null + train_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet + train_data_num: null + val_data_num: null + prompt_key: prompt + max_prompt_length: 4096 + max_response_length: 1024 + max_start_length: 256 + max_obs_length: 512 + train_batch_size: 128 + val_batch_size: 64 + return_raw_input_ids: false + return_raw_chat: false + shuffle_train_dataloader: true +actor_rollout_ref: + hybrid_engine: true + model: + path: Qwen/Qwen3-4B-Instruct-2507 + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + use_remove_padding: false + actor: + strategy: fsdp + ppo_mini_batch_size: 64 + ppo_micro_batch_size: 64 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + grad_clip: 1.0 + state_masking: false + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: false + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + ulysses_sequence_parallel_size: 1 + optim: + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + min_lr_ratio: null + warmup_style: constant + total_training_steps: -1 + fsdp_config: + wrap_policy: + min_num_params: 0 + param_offload: true + grad_offload: false + optimizer_offload: true + fsdp_size: -1 + ppo_micro_batch_size_per_gpu: 16 + ref: + fsdp_config: + param_offload: true + wrap_policy: + min_num_params: 0 + fsdp_size: -1 + log_prob_micro_batch_size: 64 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} + rollout: + name: vllm + temperature: 1.0 + top_k: -1 + top_p: 0.95 + prompt_length: ${data.max_prompt_length} + response_length: ${data.max_response_length} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + free_cache_engine: true + load_format: dummy_dtensor + tensor_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + log_prob_micro_batch_size: 64 + 
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + do_sample: true + 'n': 1 + n_agent: 1 +critic: + strategy: fsdp + optim: + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + min_lr_ratio: null + warmup_style: constant + total_training_steps: -1 + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: {} + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: false + use_remove_padding: false + fsdp_config: + param_offload: false + grad_offload: false + optimizer_offload: false + wrap_policy: + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: 64 + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 +reward_model: + enable: false + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: false + fsdp_config: + min_num_params: 0 + param_offload: false + micro_batch_size: 64 + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + structure_format_score: 0 + final_format_score: 0 + retrieval_score: 0 +retriever: + url: http://127.0.0.1:8000/retrieve + topk: 3 +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + no_think_rl: false + kl_penalty: kl + kl_ctrl: + type: fixed + kl_coef: 0.001 + state_masking: + start_state_marker: + end_state_marker: +trainer: + total_epochs: 15 + total_training_steps: 1005 + project_name: '' + experiment_name: llm_guard_3B_10k_v2 + logger: + - wandb + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 100 + test_freq: 50 + critic_warmup: 0 + default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name} + default_local_dir: verl_checkpoints/llm_guard_3B_10k_v2 +max_turns: 1 +do_search: false diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/.hydra/hydra.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af205efd98c461db581b5e14fb2d45f78553bd73 --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/.hydra/hydra.yaml @@ -0,0 +1,189 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet + - data.train_batch_size=128 + - data.val_batch_size=64 + - data.max_prompt_length=4096 + - data.max_response_length=1024 + - data.shuffle_train_dataloader=True + - algorithm.adv_estimator=grpo + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.model.enable_gradient_checkpointing=true + - actor_rollout_ref.model.use_remove_padding=False + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.actor.ppo_mini_batch_size=64 + - +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.fsdp_config.param_offload=true + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=true + - actor_rollout_ref.rollout.log_prob_micro_batch_size=64 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.ref.log_prob_micro_batch_size=64 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - trainer.logger=[wandb] + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=50 + - trainer.project_name= + - trainer.experiment_name=llm_guard_3B_10k_v2 + - trainer.total_epochs=15 + - trainer.total_training_steps=1005 + - trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2 + - do_search=false + - max_turns=1 + job: + name: main_ppo + chdir: null + override_dirname: 
+actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.fsdp_config.optimizer_offload=true,actor_rollout_ref.actor.fsdp_config.param_offload=true,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_mini_batch_size=64,actor_rollout_ref.model.enable_gradient_checkpointing=true,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=False,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,data.max_prompt_length=4096,data.max_response_length=1024,data.shuffle_train_dataloader=True,data.train_batch_size=128,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet,data.val_batch_size=64,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet,do_search=false,max_turns=1,trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2,trainer.experiment_name=llm_guard_3B_10k_v2,trainer.logger=[wandb],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.save_freq=100,trainer.test_freq=50,trainer.total_epochs=15,trainer.total_training_steps=1005 + id: ??? + num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1 + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/.hydra/overrides.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..568f9b0638e61737a89a1cdab9fa0d26f07edd7b --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/.hydra/overrides.yaml @@ -0,0 +1,35 @@ +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet +- data.train_batch_size=128 +- data.val_batch_size=64 +- data.max_prompt_length=4096 +- data.max_response_length=1024 +- data.shuffle_train_dataloader=True +- algorithm.adv_estimator=grpo +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.model.enable_gradient_checkpointing=true +- actor_rollout_ref.model.use_remove_padding=False +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.actor.ppo_mini_batch_size=64 +- +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.fsdp_config.param_offload=true +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=true 
+- actor_rollout_ref.rollout.log_prob_micro_batch_size=64 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.ref.log_prob_micro_batch_size=64 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- trainer.logger=[wandb] +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=50 +- trainer.project_name= +- trainer.experiment_name=llm_guard_3B_10k_v2 +- trainer.total_epochs=15 +- trainer.total_training_steps=1005 +- trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2 +- do_search=false +- max_turns=1 diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/.hydra/config.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1491ed06481f53b0d9b7b8f938bbc797a19ccc7b --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/.hydra/config.yaml @@ -0,0 +1,169 @@ +data: + tokenizer: null + train_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet + train_data_num: null + val_data_num: null + prompt_key: prompt + max_prompt_length: 4096 + max_response_length: 1024 + max_start_length: 256 + max_obs_length: 512 + train_batch_size: 128 + val_batch_size: 64 + return_raw_input_ids: false + return_raw_chat: false + shuffle_train_dataloader: true +actor_rollout_ref: + hybrid_engine: true + model: + path: Qwen/Qwen3-4B-Instruct-2507 + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + use_remove_padding: false + actor: + strategy: fsdp + ppo_mini_batch_size: 64 + ppo_micro_batch_size: 64 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + grad_clip: 1.0 + state_masking: false + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: false + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + ulysses_sequence_parallel_size: 1 + optim: + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + min_lr_ratio: null + warmup_style: constant + total_training_steps: -1 + fsdp_config: + wrap_policy: + min_num_params: 0 + param_offload: true + grad_offload: false + optimizer_offload: true + fsdp_size: -1 + ppo_micro_batch_size_per_gpu: 16 + ref: + fsdp_config: + param_offload: true + wrap_policy: + min_num_params: 0 + fsdp_size: -1 + log_prob_micro_batch_size: 64 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} + rollout: + name: vllm + temperature: 1.0 + top_k: -1 + top_p: 0.95 + prompt_length: ${data.max_prompt_length} + response_length: ${data.max_response_length} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + free_cache_engine: true + load_format: dummy_dtensor + tensor_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + log_prob_micro_batch_size: 64 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + do_sample: true + 'n': 1 + n_agent: 1 +critic: + strategy: fsdp + optim: + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + 
min_lr_ratio: null + warmup_style: constant + total_training_steps: -1 + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: {} + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: false + use_remove_padding: false + fsdp_config: + param_offload: false + grad_offload: false + optimizer_offload: false + wrap_policy: + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: 64 + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 +reward_model: + enable: false + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: false + fsdp_config: + min_num_params: 0 + param_offload: false + micro_batch_size: 64 + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + structure_format_score: 0 + final_format_score: 0 + retrieval_score: 0 +retriever: + url: http://127.0.0.1:8000/retrieve + topk: 3 +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + no_think_rl: false + kl_penalty: kl + kl_ctrl: + type: fixed + kl_coef: 0.001 + state_masking: + start_state_marker: + end_state_marker: +trainer: + total_epochs: 15 + total_training_steps: 1005 + project_name: '' + experiment_name: llm_guard_3B_10k_v2 + logger: + - wandb + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 100 + test_freq: 50 + critic_warmup: 0 + default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name} + default_local_dir: verl_checkpoints/llm_guard_3B_10k_v2 +max_turns: 1 +do_search: false diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/.hydra/hydra.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..161200400716bc2ccb65b8e81dea22b9c52b1c5f --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/.hydra/hydra.yaml @@ -0,0 +1,189 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. 
+ + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet + - data.train_batch_size=128 + - data.val_batch_size=64 + - data.max_prompt_length=4096 + - data.max_response_length=1024 + - data.shuffle_train_dataloader=True + - algorithm.adv_estimator=grpo + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.model.enable_gradient_checkpointing=true + - actor_rollout_ref.model.use_remove_padding=False + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.actor.ppo_mini_batch_size=64 + - +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.fsdp_config.param_offload=true + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=true + - actor_rollout_ref.rollout.log_prob_micro_batch_size=64 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.ref.log_prob_micro_batch_size=64 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - trainer.logger=[wandb] + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=50 + - trainer.project_name= + - trainer.experiment_name=llm_guard_3B_10k_v2 + - trainer.total_epochs=15 + - trainer.total_training_steps=1005 + - trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2 + - do_search=false + - max_turns=1 + job: + name: main_ppo + chdir: null + override_dirname: 
+actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.fsdp_config.optimizer_offload=true,actor_rollout_ref.actor.fsdp_config.param_offload=true,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_mini_batch_size=64,actor_rollout_ref.model.enable_gradient_checkpointing=true,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=False,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,data.max_prompt_length=4096,data.max_response_length=1024,data.shuffle_train_dataloader=True,data.train_batch_size=128,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet,data.val_batch_size=64,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet,do_search=false,max_turns=1,trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2,trainer.experiment_name=llm_guard_3B_10k_v2,trainer.logger=[wandb],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.save_freq=100,trainer.test_freq=50,trainer.total_epochs=15,trainer.total_training_steps=1005 + id: ??? + num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1 + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/.hydra/overrides.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..568f9b0638e61737a89a1cdab9fa0d26f07edd7b --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/.hydra/overrides.yaml @@ -0,0 +1,35 @@ +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet +- data.train_batch_size=128 +- data.val_batch_size=64 +- data.max_prompt_length=4096 +- data.max_response_length=1024 +- data.shuffle_train_dataloader=True +- algorithm.adv_estimator=grpo +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.model.enable_gradient_checkpointing=true +- actor_rollout_ref.model.use_remove_padding=False +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.actor.ppo_mini_batch_size=64 +- +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.fsdp_config.param_offload=true +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=true 
+- actor_rollout_ref.rollout.log_prob_micro_batch_size=64 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.ref.log_prob_micro_batch_size=64 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- trainer.logger=[wandb] +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=50 +- trainer.project_name= +- trainer.experiment_name=llm_guard_3B_10k_v2 +- trainer.total_epochs=15 +- trainer.total_training_steps=1005 +- trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2 +- do_search=false +- max_turns=1 diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/main_ppo.log b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-33-01/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/.hydra/config.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1491ed06481f53b0d9b7b8f938bbc797a19ccc7b --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/.hydra/config.yaml @@ -0,0 +1,169 @@ +data: + tokenizer: null + train_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet + train_data_num: null + val_data_num: null + prompt_key: prompt + max_prompt_length: 4096 + max_response_length: 1024 + max_start_length: 256 + max_obs_length: 512 + train_batch_size: 128 + val_batch_size: 64 + return_raw_input_ids: false + return_raw_chat: false + shuffle_train_dataloader: true +actor_rollout_ref: + hybrid_engine: true + model: + path: Qwen/Qwen3-4B-Instruct-2507 + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + use_remove_padding: false + actor: + strategy: fsdp + ppo_mini_batch_size: 64 + ppo_micro_batch_size: 64 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + grad_clip: 1.0 + state_masking: false + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: false + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + ulysses_sequence_parallel_size: 1 + optim: + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + min_lr_ratio: null + warmup_style: constant + total_training_steps: -1 + fsdp_config: + wrap_policy: + min_num_params: 0 + param_offload: true + grad_offload: false + optimizer_offload: true + fsdp_size: -1 + ppo_micro_batch_size_per_gpu: 16 + ref: + fsdp_config: + param_offload: true + wrap_policy: + min_num_params: 0 + fsdp_size: -1 + log_prob_micro_batch_size: 64 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} + rollout: + name: vllm + temperature: 1.0 + top_k: -1 + top_p: 0.95 + prompt_length: ${data.max_prompt_length} + response_length: ${data.max_response_length} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + free_cache_engine: true + load_format: dummy_dtensor + tensor_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + log_prob_micro_batch_size: 64 + 
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + do_sample: true + 'n': 1 + n_agent: 1 +critic: + strategy: fsdp + optim: + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + min_lr_ratio: null + warmup_style: constant + total_training_steps: -1 + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: {} + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: false + use_remove_padding: false + fsdp_config: + param_offload: false + grad_offload: false + optimizer_offload: false + wrap_policy: + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: 64 + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 +reward_model: + enable: false + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: false + fsdp_config: + min_num_params: 0 + param_offload: false + micro_batch_size: 64 + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + structure_format_score: 0 + final_format_score: 0 + retrieval_score: 0 +retriever: + url: http://127.0.0.1:8000/retrieve + topk: 3 +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + no_think_rl: false + kl_penalty: kl + kl_ctrl: + type: fixed + kl_coef: 0.001 + state_masking: + start_state_marker: + end_state_marker: +trainer: + total_epochs: 15 + total_training_steps: 1005 + project_name: '' + experiment_name: llm_guard_3B_10k_v2 + logger: + - wandb + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 100 + test_freq: 50 + critic_warmup: 0 + default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name} + default_local_dir: verl_checkpoints/llm_guard_3B_10k_v2 +max_turns: 1 +do_search: false diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/.hydra/hydra.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7c6ea002941bc3746cc9ed357c9b0ced4c1e5969 --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/.hydra/hydra.yaml @@ -0,0 +1,189 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet + - data.train_batch_size=128 + - data.val_batch_size=64 + - data.max_prompt_length=4096 + - data.max_response_length=1024 + - data.shuffle_train_dataloader=True + - algorithm.adv_estimator=grpo + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.model.enable_gradient_checkpointing=true + - actor_rollout_ref.model.use_remove_padding=False + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.actor.ppo_mini_batch_size=64 + - +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.fsdp_config.param_offload=true + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=true + - actor_rollout_ref.rollout.log_prob_micro_batch_size=64 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.ref.log_prob_micro_batch_size=64 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - trainer.logger=[wandb] + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=50 + - trainer.project_name= + - trainer.experiment_name=llm_guard_3B_10k_v2 + - trainer.total_epochs=15 + - trainer.total_training_steps=1005 + - trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2 + - do_search=false + - max_turns=1 + job: + name: main_ppo + chdir: null + override_dirname: 
+actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.fsdp_config.optimizer_offload=true,actor_rollout_ref.actor.fsdp_config.param_offload=true,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_mini_batch_size=64,actor_rollout_ref.model.enable_gradient_checkpointing=true,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=False,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,data.max_prompt_length=4096,data.max_response_length=1024,data.shuffle_train_dataloader=True,data.train_batch_size=128,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet,data.val_batch_size=64,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet,do_search=false,max_turns=1,trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2,trainer.experiment_name=llm_guard_3B_10k_v2,trainer.logger=[wandb],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.save_freq=100,trainer.test_freq=50,trainer.total_epochs=15,trainer.total_training_steps=1005 + id: ??? + num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1 + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/.hydra/overrides.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..568f9b0638e61737a89a1cdab9fa0d26f07edd7b --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/.hydra/overrides.yaml @@ -0,0 +1,35 @@ +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet +- data.train_batch_size=128 +- data.val_batch_size=64 +- data.max_prompt_length=4096 +- data.max_response_length=1024 +- data.shuffle_train_dataloader=True +- algorithm.adv_estimator=grpo +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.model.enable_gradient_checkpointing=true +- actor_rollout_ref.model.use_remove_padding=False +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.actor.ppo_mini_batch_size=64 +- +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.fsdp_config.param_offload=true +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=true 
+- actor_rollout_ref.rollout.log_prob_micro_batch_size=64 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.ref.log_prob_micro_batch_size=64 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- trainer.logger=[wandb] +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=50 +- trainer.project_name= +- trainer.experiment_name=llm_guard_3B_10k_v2 +- trainer.total_epochs=15 +- trainer.total_training_steps=1005 +- trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2 +- do_search=false +- max_turns=1 diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/main_ppo.log b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-35-38/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/.hydra/config.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1491ed06481f53b0d9b7b8f938bbc797a19ccc7b --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/.hydra/config.yaml @@ -0,0 +1,169 @@ +data: + tokenizer: null + train_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet + train_data_num: null + val_data_num: null + prompt_key: prompt + max_prompt_length: 4096 + max_response_length: 1024 + max_start_length: 256 + max_obs_length: 512 + train_batch_size: 128 + val_batch_size: 64 + return_raw_input_ids: false + return_raw_chat: false + shuffle_train_dataloader: true +actor_rollout_ref: + hybrid_engine: true + model: + path: Qwen/Qwen3-4B-Instruct-2507 + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + use_remove_padding: false + actor: + strategy: fsdp + ppo_mini_batch_size: 64 + ppo_micro_batch_size: 64 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + grad_clip: 1.0 + state_masking: false + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: false + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + ulysses_sequence_parallel_size: 1 + optim: + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + min_lr_ratio: null + warmup_style: constant + total_training_steps: -1 + fsdp_config: + wrap_policy: + min_num_params: 0 + param_offload: true + grad_offload: false + optimizer_offload: true + fsdp_size: -1 + ppo_micro_batch_size_per_gpu: 16 + ref: + fsdp_config: + param_offload: true + wrap_policy: + min_num_params: 0 + fsdp_size: -1 + log_prob_micro_batch_size: 64 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} + rollout: + name: vllm + temperature: 1.0 + top_k: -1 + top_p: 0.95 + prompt_length: ${data.max_prompt_length} + response_length: ${data.max_response_length} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + free_cache_engine: true + load_format: dummy_dtensor + tensor_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + log_prob_micro_batch_size: 64 + 
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + do_sample: true + 'n': 1 + n_agent: 1 +critic: + strategy: fsdp + optim: + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + min_lr_ratio: null + warmup_style: constant + total_training_steps: -1 + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: {} + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: false + use_remove_padding: false + fsdp_config: + param_offload: false + grad_offload: false + optimizer_offload: false + wrap_policy: + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: 64 + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 +reward_model: + enable: false + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: false + fsdp_config: + min_num_params: 0 + param_offload: false + micro_batch_size: 64 + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + structure_format_score: 0 + final_format_score: 0 + retrieval_score: 0 +retriever: + url: http://127.0.0.1:8000/retrieve + topk: 3 +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + no_think_rl: false + kl_penalty: kl + kl_ctrl: + type: fixed + kl_coef: 0.001 + state_masking: + start_state_marker: + end_state_marker: +trainer: + total_epochs: 15 + total_training_steps: 1005 + project_name: '' + experiment_name: llm_guard_3B_10k_v2 + logger: + - wandb + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 100 + test_freq: 50 + critic_warmup: 0 + default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name} + default_local_dir: verl_checkpoints/llm_guard_3B_10k_v2 +max_turns: 1 +do_search: false diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/.hydra/hydra.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7792025f706af9ad26eb8c208a4ba75b6ebf08af --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/.hydra/hydra.yaml @@ -0,0 +1,189 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet + - data.train_batch_size=128 + - data.val_batch_size=64 + - data.max_prompt_length=4096 + - data.max_response_length=1024 + - data.shuffle_train_dataloader=True + - algorithm.adv_estimator=grpo + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.model.enable_gradient_checkpointing=true + - actor_rollout_ref.model.use_remove_padding=False + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.actor.ppo_mini_batch_size=64 + - +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.fsdp_config.param_offload=true + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=true + - actor_rollout_ref.rollout.log_prob_micro_batch_size=64 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.ref.log_prob_micro_batch_size=64 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - trainer.logger=[wandb] + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=50 + - trainer.project_name= + - trainer.experiment_name=llm_guard_3B_10k_v2 + - trainer.total_epochs=15 + - trainer.total_training_steps=1005 + - trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2 + - do_search=false + - max_turns=1 + job: + name: main_ppo + chdir: null + override_dirname: 
+actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.fsdp_config.optimizer_offload=true,actor_rollout_ref.actor.fsdp_config.param_offload=true,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_mini_batch_size=64,actor_rollout_ref.model.enable_gradient_checkpointing=true,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=False,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,data.max_prompt_length=4096,data.max_response_length=1024,data.shuffle_train_dataloader=True,data.train_batch_size=128,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet,data.val_batch_size=64,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet,do_search=false,max_turns=1,trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2,trainer.experiment_name=llm_guard_3B_10k_v2,trainer.logger=[wandb],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.save_freq=100,trainer.test_freq=50,trainer.total_epochs=15,trainer.total_training_steps=1005 + id: ??? + num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1 + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/.hydra/overrides.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..568f9b0638e61737a89a1cdab9fa0d26f07edd7b --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/.hydra/overrides.yaml @@ -0,0 +1,35 @@ +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet +- data.train_batch_size=128 +- data.val_batch_size=64 +- data.max_prompt_length=4096 +- data.max_response_length=1024 +- data.shuffle_train_dataloader=True +- algorithm.adv_estimator=grpo +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.model.enable_gradient_checkpointing=true +- actor_rollout_ref.model.use_remove_padding=False +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.actor.ppo_mini_batch_size=64 +- +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.fsdp_config.param_offload=true +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=true 
+- actor_rollout_ref.rollout.log_prob_micro_batch_size=64 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.ref.log_prob_micro_batch_size=64 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- trainer.logger=[wandb] +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=100 +- trainer.test_freq=50 +- trainer.project_name= +- trainer.experiment_name=llm_guard_3B_10k_v2 +- trainer.total_epochs=15 +- trainer.total_training_steps=1005 +- trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2 +- do_search=false +- max_turns=1 diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/main_ppo.log b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-41-08/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/.hydra/config.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1491ed06481f53b0d9b7b8f938bbc797a19ccc7b --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/.hydra/config.yaml @@ -0,0 +1,169 @@ +data: + tokenizer: null + train_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet + train_data_num: null + val_data_num: null + prompt_key: prompt + max_prompt_length: 4096 + max_response_length: 1024 + max_start_length: 256 + max_obs_length: 512 + train_batch_size: 128 + val_batch_size: 64 + return_raw_input_ids: false + return_raw_chat: false + shuffle_train_dataloader: true +actor_rollout_ref: + hybrid_engine: true + model: + path: Qwen/Qwen3-4B-Instruct-2507 + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + use_remove_padding: false + actor: + strategy: fsdp + ppo_mini_batch_size: 64 + ppo_micro_batch_size: 64 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + grad_clip: 1.0 + state_masking: false + clip_ratio: 0.2 + entropy_coeff: 0.001 + use_kl_loss: false + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + ulysses_sequence_parallel_size: 1 + optim: + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + min_lr_ratio: null + warmup_style: constant + total_training_steps: -1 + fsdp_config: + wrap_policy: + min_num_params: 0 + param_offload: true + grad_offload: false + optimizer_offload: true + fsdp_size: -1 + ppo_micro_batch_size_per_gpu: 16 + ref: + fsdp_config: + param_offload: true + wrap_policy: + min_num_params: 0 + fsdp_size: -1 + log_prob_micro_batch_size: 64 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} + rollout: + name: vllm + temperature: 1.0 + top_k: -1 + top_p: 0.95 + prompt_length: ${data.max_prompt_length} + response_length: ${data.max_response_length} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + free_cache_engine: true + load_format: dummy_dtensor + tensor_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + log_prob_micro_batch_size: 64 + 
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + do_sample: true + 'n': 1 + n_agent: 1 +critic: + strategy: fsdp + optim: + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + min_lr_ratio: null + warmup_style: constant + total_training_steps: -1 + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: {} + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: false + use_remove_padding: false + fsdp_config: + param_offload: false + grad_offload: false + optimizer_offload: false + wrap_policy: + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: 64 + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 +reward_model: + enable: false + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: false + fsdp_config: + min_num_params: 0 + param_offload: false + micro_batch_size: 64 + max_length: null + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + structure_format_score: 0 + final_format_score: 0 + retrieval_score: 0 +retriever: + url: http://127.0.0.1:8000/retrieve + topk: 3 +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + no_think_rl: false + kl_penalty: kl + kl_ctrl: + type: fixed + kl_coef: 0.001 + state_masking: + start_state_marker: + end_state_marker: +trainer: + total_epochs: 15 + total_training_steps: 1005 + project_name: '' + experiment_name: llm_guard_3B_10k_v2 + logger: + - wandb + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 100 + test_freq: 50 + critic_warmup: 0 + default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name} + default_local_dir: verl_checkpoints/llm_guard_3B_10k_v2 +max_turns: 1 +do_search: false diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/.hydra/hydra.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dfd02ebf37d6209e13bbe8cbf1a9cb005bc99507 --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/.hydra/hydra.yaml @@ -0,0 +1,189 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet + - data.train_batch_size=128 + - data.val_batch_size=64 + - data.max_prompt_length=4096 + - data.max_response_length=1024 + - data.shuffle_train_dataloader=True + - algorithm.adv_estimator=grpo + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.model.enable_gradient_checkpointing=true + - actor_rollout_ref.model.use_remove_padding=False + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.actor.ppo_mini_batch_size=64 + - +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.fsdp_config.param_offload=true + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=true + - actor_rollout_ref.rollout.log_prob_micro_batch_size=64 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.ref.log_prob_micro_batch_size=64 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - trainer.logger=[wandb] + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=100 + - trainer.test_freq=50 + - trainer.project_name= + - trainer.experiment_name=llm_guard_3B_10k_v2 + - trainer.total_epochs=15 + - trainer.total_training_steps=1005 + - trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2 + - do_search=false + - max_turns=1 + job: + name: main_ppo + chdir: null + override_dirname: 
+actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.fsdp_config.optimizer_offload=true,actor_rollout_ref.actor.fsdp_config.param_offload=true,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_mini_batch_size=64,actor_rollout_ref.model.enable_gradient_checkpointing=true,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=False,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size=64,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,data.max_prompt_length=4096,data.max_response_length=1024,data.shuffle_train_dataloader=True,data.train_batch_size=128,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet,data.val_batch_size=64,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet,do_search=false,max_turns=1,trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2,trainer.experiment_name=llm_guard_3B_10k_v2,trainer.logger=[wandb],trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=,trainer.save_freq=100,trainer.test_freq=50,trainer.total_epochs=15,trainer.total_training_steps=1005 + id: ??? + num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1 + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/.hydra/overrides.yaml b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..568f9b0638e61737a89a1cdab9fa0d26f07edd7b --- /dev/null +++ b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/.hydra/overrides.yaml @@ -0,0 +1,35 @@ +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet +- data.train_batch_size=128 +- data.val_batch_size=64 +- data.max_prompt_length=4096 +- data.max_response_length=1024 +- data.shuffle_train_dataloader=True +- algorithm.adv_estimator=grpo +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.model.enable_gradient_checkpointing=true +- actor_rollout_ref.model.use_remove_padding=False +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.actor.ppo_mini_batch_size=64 +- +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.fsdp_config.param_offload=true +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=true 
+- actor_rollout_ref.rollout.log_prob_micro_batch_size=64
+- actor_rollout_ref.rollout.tensor_model_parallel_size=1
+- actor_rollout_ref.rollout.name=vllm
+- actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+- actor_rollout_ref.ref.log_prob_micro_batch_size=64
+- actor_rollout_ref.ref.fsdp_config.param_offload=True
+- actor_rollout_ref.actor.kl_loss_coef=0.001
+- trainer.logger=[wandb]
+- trainer.n_gpus_per_node=2
+- trainer.nnodes=1
+- trainer.save_freq=100
+- trainer.test_freq=50
+- trainer.project_name=
+- trainer.experiment_name=llm_guard_3B_10k_v2
+- trainer.total_epochs=15
+- trainer.total_training_steps=1005
+- trainer.default_local_dir=verl_checkpoints/llm_guard_3B_10k_v2
+- do_search=false
+- max_turns=1
diff --git a/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/main_ppo.log b/code/RL_model/verl/Search-R1/outputs/2026-02-01/20-42-57/main_ppo.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/RL_model/verl/Search-R1/search_r1/llm_agent/__init__.py b/code/RL_model/verl/Search-R1/search_r1/llm_agent/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/RL_model/verl/Search-R1/search_r1/llm_agent/generation.py b/code/RL_model/verl/Search-R1/search_r1/llm_agent/generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b68cb003ac3f943d45eb8d5cf48a7ebee5cd1f6
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/llm_agent/generation.py
@@ -0,0 +1,469 @@
+import torch
+import re
+from collections import defaultdict
+import os
+from typing import List, Dict, Any, Tuple
+from dataclasses import dataclass
+from .tensor_helper import TensorHelper, TensorConfig
+from verl import DataProto
+from verl.utils.tracking import Tracking
+import shutil
+import requests
+
+@dataclass
+class GenerationConfig:
+    max_turns: int
+    max_start_length: int
+    max_prompt_length: int
+    max_response_length: int
+    max_obs_length: int
+    num_gpus: int
+    no_think_rl: bool = False
+    search_url: str = None
+    topk: int = 3
+
+class LLMGenerationManager:
+    def __init__(
+        self,
+        tokenizer,
+        actor_rollout_wg,
+        config: GenerationConfig,
+        is_validation: bool = False,
+    ):
+        self.tokenizer = tokenizer
+        self.actor_rollout_wg = actor_rollout_wg
+        self.config = config
+        self.is_validation = is_validation
+
+        self.tensor_fn = TensorHelper(TensorConfig(
+            pad_token_id=tokenizer.pad_token_id,
+            max_prompt_length=config.max_prompt_length,
+            max_obs_length=config.max_obs_length,
+            max_start_length=config.max_start_length
+        ))
+
+    def _batch_tokenize(self, responses: List[str]) -> torch.Tensor:
+        """Tokenize a batch of responses."""
+        return self.tokenizer(
+            responses,
+            add_special_tokens=False,
+            return_tensors='pt',
+            padding="longest"
+        )['input_ids']
+
+    def _postprocess_responses(self, responses: torch.Tensor) -> torch.Tensor:
+        """Process responses to stop at search operation or answer operation."""
+        responses_str = self.tokenizer.batch_decode(
+            responses,
+            skip_special_tokens=True
+        )
+
+        # Truncate each rollout at the first closing </search> or </answer> tag so the
+        # environment only ever sees a single action per turn.
+        responses_str = [resp.split('</search>')[0] + '</search>'
+                         if '</search>' in resp
+                         else resp.split('</answer>')[0] + '</answer>'
+                         if '</answer>' in resp
+                         else resp
+                         for resp in responses_str]
+
+        if self.config.no_think_rl:
+            raise ValueError('stop')
+            # if no_think_rl is enabled, only keep action in the str
+            actions, _ = self.env.postprocess_predictions(responses_str)
+            responses_str = [f"<answer>{envs[idx].ACTION_LOOKUP[action]}</answer>" for idx, action in enumerate(actions)]
+
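+        # Debug-print the processed responses, then re-tokenize them so the returned ids and strings stay in sync.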
print("RESPONSES:", responses_str) + responses = self._batch_tokenize(responses_str) + return responses, responses_str + + def _process_next_obs(self, next_obs: List[str]) -> torch.Tensor: + """Process next observations from environment.""" + + next_obs_ids = self.tokenizer( + next_obs, + padding='longest', + return_tensors='pt', + add_special_tokens=False, # Prevents adding special tokens + )['input_ids'] + + if next_obs_ids.shape[1] > self.config.max_obs_length: + print(f"[WARNING] OBSERVATION TOO LONG, CONSIDER CHANGING YOUR CONFIG, {next_obs_ids.shape[1]} & {self.config.max_obs_length}") + next_obs_ids = next_obs_ids[:, :self.config.max_obs_length] + + return next_obs_ids + + def _update_rolling_state(self, rollings: DataProto, cur_responses: torch.Tensor, + next_obs_ids: torch.Tensor) -> Dict: + """Update rolling state with new responses and observations.""" + # Concatenate and handle padding + new_input_ids = self.tensor_fn.concatenate_with_padding([ + rollings.batch['input_ids'], + cur_responses, + next_obs_ids + ]) + + # Create attention mask and position ids + new_attention_mask = self.tensor_fn.create_attention_mask(new_input_ids) + new_position_ids = self.tensor_fn.create_position_ids(new_attention_mask) + + # Cut to appropriate length + effective_len = new_attention_mask.sum(dim=1).max() + max_len = min(self.config.max_prompt_length, effective_len) + + new_rollings = DataProto.from_dict({ + 'input_ids': new_input_ids[:, -max_len:], + 'position_ids': new_position_ids[:, -max_len:], + 'attention_mask': new_attention_mask[:, -max_len:] + }) + new_rollings.meta_info.update(rollings.meta_info) + + return new_rollings + + def _info_masked_concatenate_with_padding(self, + prompt: torch.Tensor, + prompt_with_mask: torch.Tensor, + response: torch.Tensor, + info: torch.Tensor = None, + pad_to_left: bool = True + ) -> torch.Tensor: + """Concatenate tensors and handle padding. 
Additionally, create a mask (info_mask) to cover the information block if it exists.""" + pad_id = self.tokenizer.pad_token_id + tensors = [prompt, response] + tensors_with_mask = [prompt_with_mask, response] + if info is not None: + tensors.append(info) + info_mask = torch.full(info.size(), pad_id, dtype=info.dtype, device=info.device) # information mask + tensors_with_mask.append(info_mask) + + concatenated = torch.cat(tensors, dim=1) + concatenated_with_info = torch.cat(tensors_with_mask, dim=1) + mask = concatenated != pad_id if pad_to_left else concatenated == pad_id + sorted_indices = mask.to(torch.int64).argsort(dim=1, stable=True) + padded_tensor = concatenated.gather(1, sorted_indices) + padded_tensor_with_info = concatenated_with_info.gather(1, sorted_indices) + + return padded_tensor, padded_tensor_with_info + + def _update_right_side(self, right_side: Dict, + cur_responses: torch.Tensor, + next_obs_ids: torch.Tensor = None) -> Dict: + """Update right side state.""" + if next_obs_ids != None: + responses, responses_with_info_mask = self._info_masked_concatenate_with_padding( + right_side['responses'], + right_side['responses_with_info_mask'], + cur_responses, + next_obs_ids, + pad_to_left=False + ) + else: + responses, responses_with_info_mask = self._info_masked_concatenate_with_padding( + right_side['responses'], + right_side['responses_with_info_mask'], + cur_responses, + pad_to_left=False + ) + effective_len = self.tensor_fn.create_attention_mask(responses).sum(dim=1).max() + max_len = min(self.config.max_prompt_length, effective_len) + + return {'responses': responses[:, :max_len], 'responses_with_info_mask': responses_with_info_mask[:, :max_len]} + + def _generate_with_gpu_padding(self, active_batch: DataProto) -> DataProto: + """ + Wrapper for generation that handles multi-GPU padding requirements. 
+ if num_gpus <= 1, return self.actor_rollout_wg.generate_sequences(active_batch) + if active_batch size is not divisible by num_gpus, pad with first sequence + then remove padding from output + """ + num_gpus = self.config.num_gpus + if num_gpus <= 1: + return self.actor_rollout_wg.generate_sequences(active_batch) + + batch_size = active_batch.batch['input_ids'].shape[0] + remainder = batch_size % num_gpus + + for key in active_batch.batch.keys(): + active_batch.batch[key] = active_batch.batch[key].long() + if remainder == 0: + return self.actor_rollout_wg.generate_sequences(active_batch) + + # Add padding sequences + padding_size = num_gpus - remainder + padded_batch = {} + + for k, v in active_batch.batch.items(): + # Use first sequence as padding template + pad_sequence = v[0:1].repeat(padding_size, *[1] * (len(v.shape) - 1)) + padded_batch[k] = torch.cat([v, pad_sequence], dim=0) + + padded_active_batch = DataProto.from_dict(padded_batch) + for key in padded_active_batch.batch.keys(): + padded_active_batch.batch[key] = padded_active_batch.batch[key].long() + + # Generate with padded batch + padded_output = self.actor_rollout_wg.generate_sequences(padded_active_batch) + + # Remove padding from output + trimmed_batch = {k: v[:-padding_size] for k, v in padded_output.batch.items()} + + # Handle meta_info if present + if hasattr(padded_output, 'meta_info') and padded_output.meta_info: + trimmed_meta = {} + for k, v in padded_output.meta_info.items(): + if isinstance(v, torch.Tensor): + trimmed_meta[k] = v[:-padding_size] + else: + trimmed_meta[k] = v + padded_output.meta_info = trimmed_meta + + padded_output.batch = trimmed_batch + return padded_output + + def run_llm_loop(self, gen_batch, initial_input_ids: torch.Tensor) -> Tuple[Dict, Dict]: + """Run main LLM generation loop.""" + + original_left_side = {'input_ids': initial_input_ids[:, -self.config.max_start_length:]} + original_right_side = {'responses': initial_input_ids[:, []], 'responses_with_info_mask': initial_input_ids[:, []]} + + active_mask = torch.ones(gen_batch.batch['input_ids'].shape[0], dtype=torch.bool) + turns_stats = torch.ones(gen_batch.batch['input_ids'].shape[0], dtype=torch.int) + valid_action_stats = torch.zeros(gen_batch.batch['input_ids'].shape[0], dtype=torch.int) + valid_search_stats = torch.zeros(gen_batch.batch['input_ids'].shape[0], dtype=torch.int) + active_num_list = [active_mask.sum().item()] + rollings = gen_batch + + # Main generation loop + for step in range(self.config.max_turns): + if not active_mask.sum(): + break + rollings.batch = self.tensor_fn.cut_to_effective_len( + rollings.batch, + keys=['input_ids', 'attention_mask', 'position_ids'] + ) + + # gen_output = self.actor_rollout_wg.generate_sequences(rollings) + rollings_active = DataProto.from_dict({ + k: v[active_mask] for k, v in rollings.batch.items() + }) + gen_output = self._generate_with_gpu_padding(rollings_active) + + meta_info = gen_output.meta_info + responses_ids, responses_str = self._postprocess_responses(gen_output.batch['responses']) + responses_ids, responses_str = self.tensor_fn._example_level_pad(responses_ids, responses_str, active_mask) + + # Execute in environment and process observations + next_obs, dones, valid_action, is_search = self.execute_predictions( + responses_str, self.tokenizer.pad_token, active_mask + ) + + curr_active_mask = torch.tensor([not done for done in dones], dtype=torch.bool) + active_mask = active_mask * curr_active_mask + active_num_list.append(active_mask.sum().item()) + 
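+            # Bookkeeping note: turns_stats is incremented only for trajectories
+            # that are still active after this step, while valid_action_stats and
+            # valid_search_stats accumulate the per-example 0/1 flags returned by
+            # execute_predictions.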
turns_stats[curr_active_mask] += 1 + valid_action_stats += torch.tensor(valid_action, dtype=torch.int) + valid_search_stats += torch.tensor(is_search, dtype=torch.int) + + next_obs_ids = self._process_next_obs(next_obs) + + # Update states + rollings = self._update_rolling_state( + rollings, + responses_ids, + next_obs_ids + ) + original_right_side = self._update_right_side( + original_right_side, + responses_ids, + next_obs_ids + ) + + # final LLM rollout + if active_mask.sum(): + rollings.batch = self.tensor_fn.cut_to_effective_len( + rollings.batch, + keys=['input_ids', 'attention_mask', 'position_ids'] + ) + + # gen_output = self.actor_rollout_wg.generate_sequences(rollings) + rollings_active = DataProto.from_dict({ + k: v[active_mask] for k, v in rollings.batch.items() + }) + gen_output = self._generate_with_gpu_padding(rollings_active) + + meta_info = gen_output.meta_info + responses_ids, responses_str = self._postprocess_responses(gen_output.batch['responses']) + responses_ids, responses_str = self.tensor_fn._example_level_pad(responses_ids, responses_str, active_mask) + + # # Execute in environment and process observations + _, dones, valid_action, is_search = self.execute_predictions( + responses_str, self.tokenizer.pad_token, active_mask, do_search=False + ) + + curr_active_mask = torch.tensor([not done for done in dones], dtype=torch.bool) + active_mask = active_mask * curr_active_mask + active_num_list.append(active_mask.sum().item()) + valid_action_stats += torch.tensor(valid_action, dtype=torch.int) + valid_search_stats += torch.tensor(is_search, dtype=torch.int) + + + original_right_side = self._update_right_side( + original_right_side, + responses_ids, + ) + + meta_info['turns_stats'] = turns_stats.tolist() + meta_info['active_mask'] = active_mask.tolist() + meta_info['valid_action_stats'] = valid_action_stats.tolist() + meta_info['valid_search_stats'] = valid_search_stats.tolist() + + print("ACTIVE_TRAJ_NUM:", active_num_list) + + return self._compose_final_output(original_left_side, original_right_side, meta_info) + + def _compose_final_output(self, left_side: Dict, + right_side: Dict, + meta_info: Dict) -> Tuple[Dict, Dict]: + """Compose final generation output.""" + final_output = right_side.copy() + final_output['prompts'] = left_side['input_ids'] + + # Combine input IDs + final_output['input_ids'] = torch.cat([ + left_side['input_ids'], + right_side['responses'] + ], dim=1) + + # Create attention mask and position ids + final_output['attention_mask'] = torch.cat([ + self.tensor_fn.create_attention_mask(left_side['input_ids']), + self.tensor_fn.create_attention_mask(final_output['responses']) + ], dim=1) + final_output['info_mask'] = torch.cat([ + self.tensor_fn.create_attention_mask(left_side['input_ids']), + self.tensor_fn.create_attention_mask(final_output['responses_with_info_mask']) + ], dim=1) + + final_output['position_ids'] = self.tensor_fn.create_position_ids( + final_output['attention_mask'] + ) + + final_output = DataProto.from_dict(final_output) + final_output.meta_info.update(meta_info) + + return final_output + + def execute_predictions(self, predictions: List[str], pad_token: str, active_mask=None, do_search=True) -> List[str]: + """ + Execute predictions across multiple environments. 
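+        A search action yields an observation wrapped in <information> ... </information>
+        tags built from the retrieved passages; an answer ends the trajectory with an
+        empty observation, and an invalid action yields a fixed retry hint.
+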
+        NOTE: the function is the actual `step` function in the environment
+        NOTE penalty_for_invalid is not included in the observation shown to the LLM
+
+        Args:
+            envs: List of environment instances
+            predictions: List of action predictions
+            pad_token: Token to use for padding
+
+        Returns:
+            Tuple of (next_obs, dones, valid_action, is_search) lists
+        """
+        cur_actions, contents = self.postprocess_predictions(predictions)
+        next_obs, dones, valid_action, is_search = [], [], [], []
+
+        search_queries = [content for action, content in zip(cur_actions, contents) if action == 'search']
+        if do_search:
+            search_results = self.batch_search(search_queries)
+            assert len(search_results) == sum([1 for action in cur_actions if action == 'search'])
+        else:
+            search_results = [''] * sum([1 for action in cur_actions if action == 'search'])
+
+        for i, (action, active) in enumerate(zip(cur_actions, active_mask)):
+
+            if not active:
+                next_obs.append('')
+                dones.append(1)
+                valid_action.append(0)
+                is_search.append(0)
+            else:
+                if action == 'answer':
+                    next_obs.append('')
+                    dones.append(1)
+                    valid_action.append(1)
+                    is_search.append(0)
+                elif action == 'search':
+                    next_obs.append(f'\n\n<information>{search_results.pop(0).strip()}</information>\n\n')
+                    dones.append(0)
+                    valid_action.append(1)
+                    is_search.append(1)
+                else:
+                    next_obs.append(f'\nMy previous action is invalid. \
+If I want to search, I should put the query between <search> and </search>. \
+If I want to give the final answer, I should put the answer between <answer> and </answer>. Let me try again.\n')
+                    dones.append(0)
+                    valid_action.append(0)
+                    is_search.append(0)
+
+        assert len(search_results) == 0
+
+        return next_obs, dones, valid_action, is_search
+
+    def postprocess_predictions(self, predictions: List[Any]) -> Tuple[List[int], List[bool]]:
+        """
+        Process (text-based) predictions from llm into actions and validity flags.
+
+        Args:
+            predictions: List of raw predictions
+
+        Returns:
+            Tuple of (actions list, validity flags list)
+        """
+        actions = []
+        contents = []
+
+        for prediction in predictions:
+            if isinstance(prediction, str): # for llm output
+                pattern = r'<(search|answer)>(.*?)</\1>'
+                match = re.search(pattern, prediction, re.DOTALL)
+                if match:
+                    content = match.group(2).strip()  # Return only the content inside the tags
+                    action = match.group(1)
+                else:
+                    content = ''
+                    action = None
+            else:
+                raise ValueError(f"Invalid prediction type: {type(prediction)}")
+
+            actions.append(action)
+            contents.append(content)
+
+        return actions, contents
+
+    def batch_search(self, queries: List[str] = None) -> List[str]:
+        """
+        Batchified search for queries.
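+        Illustrative request/response shape (fields taken from _batch_search below;
+        the endpoint is whatever self.config.search_url points at):
+
+            POST search_url  with  {"queries": [...], "topk": k, "return_scores": true}
+            response: {"result": [[passage_1, ..., passage_k], ...]}   # one inner list per query
+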
+ Args: + queries: queries to call the search engine + Returns: + search results which is concatenated into a string + """ + results = self._batch_search(queries)['result'] + + return [self._passages2string(result) for result in results] + + def _batch_search(self, queries): + + payload = { + "queries": queries, + "topk": self.config.topk, + "return_scores": True + } + + return requests.post(self.config.search_url, json=payload).json() + + def _passages2string(self, retrieval_result): + format_reference = '' + for idx, doc_item in enumerate(retrieval_result): + + content = doc_item['document']['contents'] + title = content.split("\n")[0] + text = "\n".join(content.split("\n")[1:]) + format_reference += f"Doc {idx+1}(Title: {title}) {text}\n" + + return format_reference diff --git a/code/RL_model/verl/Search-R1/search_r1/llm_agent/tensor_helper.py b/code/RL_model/verl/Search-R1/search_r1/llm_agent/tensor_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..15a7c7c084c4f952533f43b214f987db81075255 --- /dev/null +++ b/code/RL_model/verl/Search-R1/search_r1/llm_agent/tensor_helper.py @@ -0,0 +1,75 @@ +import torch +from typing import Dict, Tuple, List +from dataclasses import dataclass + +@dataclass +class TensorConfig: + pad_token_id: int + max_prompt_length: int + max_obs_length: int + max_start_length: int + +class TensorHelper: + def __init__(self, config: TensorConfig): + self.config = config + + def cut_to_effective_len(self, tensor_dict: Dict[str, torch.Tensor], + keys: List[str], cut_left: bool = True) -> Dict[str, torch.Tensor]: + """Cut tensors to their effective length based on attention mask.""" + effective_len = tensor_dict['attention_mask'].sum(dim=1).max() + result = tensor_dict.copy() + + for key in keys: + if cut_left: + result[key] = tensor_dict[key][:, -effective_len:] + else: + result[key] = tensor_dict[key][:, :effective_len] + return result + + def convert_pad_structure(self, tensor: torch.Tensor, pad_to_left: bool = True) -> Tuple[torch.Tensor, torch.Tensor]: + """Convert padding structure and return sorted tensor with indices.""" + mask = tensor != self.config.pad_token_id if pad_to_left else tensor == self.config.pad_token_id + sorted_indices = mask.to(torch.int64).argsort(dim=1, stable=True) + return tensor.gather(1, sorted_indices), sorted_indices + + def create_attention_mask(self, input_ids: torch.Tensor) -> torch.Tensor: + """Create attention mask from input ids.""" + return torch.where(input_ids != self.config.pad_token_id, 1, 0) + + def create_position_ids(self, attention_mask: torch.Tensor) -> torch.Tensor: + """Create position ids from attention mask.""" + return (torch.cumsum(attention_mask, dim=1) - 1) * attention_mask + + def concatenate_with_padding(self, tensors: List[torch.Tensor], + pad_to_left: bool = True) -> torch.Tensor: + """Concatenate tensors and handle padding.""" + concatenated = torch.cat(tensors, dim=1) + padded_tensor, _ = self.convert_pad_structure(concatenated, pad_to_left) + return padded_tensor + + def _example_level_pad(self, responses: torch.Tensor, + responses_str: List[str], + active_mask: torch.Tensor) -> Tuple[torch.Tensor, List[str]]: + """ + Pad responses for non-active examples with pad tokens. 
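+        Illustrative shapes: with active_mask = [True, False, True] and responses of
+        shape (2, L), the padded output has shape (3, L) with row 1 filled with
+        pad_token_id, and responses_str becomes [responses_str[0], "", responses_str[1]].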
+ """ + assert active_mask.sum() == responses.shape[0] + # Create masked responses tensor + batch_size = active_mask.shape[0] + seq_len = responses.shape[1] + padded_responses = torch.full( + (batch_size, seq_len), self.config.pad_token_id, + dtype=responses.dtype, device=responses.device + ) + padded_responses[active_mask] = responses + + # Create masked response strings + padded_responses_str = [""] * batch_size + + s = 0 + for i, is_active in enumerate(active_mask): + if is_active: + padded_responses_str[i] = responses_str[s] + s += 1 + + return padded_responses, padded_responses_str \ No newline at end of file diff --git a/code/RL_model/verl/Search-R1/search_r1/search/build_index.sh b/code/RL_model/verl/Search-R1/search_r1/search/build_index.sh new file mode 100644 index 0000000000000000000000000000000000000000..05556a3939471d956360bc1f91d7043e19c73a85 --- /dev/null +++ b/code/RL_model/verl/Search-R1/search_r1/search/build_index.sh @@ -0,0 +1,19 @@ + +corpus_file=/your/corpus/jsonl/file # jsonl +save_dir=/the/path/to/save/index +retriever_name=e5 # this is for indexing naming +retriever_model=intfloat/e5-base-v2 + +# change faiss_type to HNSW32/64/128 for ANN indexing +# change retriever_name to bm25 for BM25 indexing +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python index_builder.py \ + --retrieval_method $retriever_name \ + --model_path $retriever_model \ + --corpus_path $corpus_file \ + --save_dir $save_dir \ + --use_fp16 \ + --max_length 256 \ + --batch_size 512 \ + --pooling_method mean \ + --faiss_type Flat \ + --save_embedding diff --git a/code/RL_model/verl/Search-R1/search_r1/search/google_search_server.py b/code/RL_model/verl/Search-R1/search_r1/search/google_search_server.py new file mode 100644 index 0000000000000000000000000000000000000000..ad72aeefae69d0796f137557ad8f3bb0d2381be6 --- /dev/null +++ b/code/RL_model/verl/Search-R1/search_r1/search/google_search_server.py @@ -0,0 +1,202 @@ +import os +import re +import requests +import argparse +import asyncio +import random +from typing import List, Optional, Dict +from concurrent.futures import ThreadPoolExecutor + +import chardet +import aiohttp +import bs4 +import uvicorn +from fastapi import FastAPI +from pydantic import BaseModel +from googleapiclient.discovery import build + + +# --- CLI Args --- +parser = argparse.ArgumentParser(description="Launch online search server.") +parser.add_argument('--api_key', type=str, required=True, help="API key for Google search") +parser.add_argument('--cse_id', type=str, required=True, help="CSE ID for Google search") +parser.add_argument('--topk', type=int, default=3, help="Number of results to return per query") +parser.add_argument('--snippet_only', action='store_true', help="If set, only return snippets; otherwise, return full context.") +args = parser.parse_args() + + +# --- Config --- +class OnlineSearchConfig: + def __init__(self, topk: int = 3, api_key: Optional[str] = None, cse_id: Optional[str] = None, snippet_only: bool = False): + self.topk = topk + self.api_key = api_key + self.cse_id = cse_id + self.snippet_only = snippet_only + + +# --- Utilities --- +def parse_snippet(snippet: str) -> List[str]: + segments = snippet.split("...") + return [s.strip() for s in segments if len(s.strip().split()) > 5] + + +def sanitize_search_query(query: str) -> str: + # Remove or replace special characters that might cause issues. + # This is a basic example; you might need to add more characters or patterns. 
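+    # Illustrative behaviour (example input assumed):
+    #   sanitize_search_query('Who wrote "Hamlet"?\n')  ->  'Who wrote Hamlet'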
+ sanitized_query = re.sub(r'[^\w\s]', ' ', query) # Replace non-alphanumeric and non-whitespace with spaces. + sanitized_query = re.sub(r'[\t\r\f\v\n]', ' ', sanitized_query) # replace tab, return, formfeed, vertical tab with spaces. + sanitized_query = re.sub(r'\s+', ' ', sanitized_query).strip() #remove duplicate spaces, and trailing/leading spaces. + + return sanitized_query + + +def filter_links(search_results: List[Dict]) -> List[str]: + links = [] + for result in search_results: + for item in result.get("items", []): + if "mime" in item: + continue + ext = os.path.splitext(item["link"])[1] + if ext in ["", ".html", ".htm", ".shtml"]: + links.append(item["link"]) + return links + + +async def fetch(session: aiohttp.ClientSession, url: str, semaphore: asyncio.Semaphore) -> str: + user_agents = [ + "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P)...", + "Mozilla/5.0 AppleWebKit/537.36...", + "Mozilla/5.0 (compatible; Googlebot/2.1; +https://www.google.com/bot.html)", + ] + headers = {"User-Agent": random.choice(user_agents)} + + async with semaphore: + try: + async with session.get(url, headers=headers) as response: + raw = await response.read() + detected = chardet.detect(raw) + encoding = detected["encoding"] or "utf-8" + return raw.decode(encoding, errors="ignore") + except (aiohttp.ClientError, asyncio.TimeoutError): + return "" + + +async def fetch_all(urls: List[str], limit: int = 8) -> List[str]: + semaphore = asyncio.Semaphore(limit) + timeout = aiohttp.ClientTimeout(total=5) + connector = aiohttp.TCPConnector(limit_per_host=limit, force_close=True) + + async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session: + tasks = [fetch(session, url, semaphore) for url in urls] + return await asyncio.gather(*tasks) + + +# --- Search Engine --- +class OnlineSearchEngine: + def __init__(self, config: OnlineSearchConfig): + self.config = config + + def collect_context(self, snippet: str, doc: str) -> str: + snippets = parse_snippet(snippet) + ctx_paras = [] + + for s in snippets: + pos = doc.replace("\n", " ").find(s) + if pos == -1: + continue + sta = pos + while sta > 0 and doc[sta] != "\n": + sta -= 1 + end = pos + len(s) + while end < len(doc) and doc[end] != "\n": + end += 1 + para = doc[sta:end].strip() + if para not in ctx_paras: + ctx_paras.append(para) + + return "\n".join(ctx_paras) + + def fetch_web_content(self, search_results: List[Dict]) -> Dict[str, str]: + links = filter_links(search_results) + contents = asyncio.run(fetch_all(links)) + content_dict = {} + for html, link in zip(contents, links): + soup = bs4.BeautifulSoup(html, "html.parser") + text = "\n".join([p.get_text() for p in soup.find_all("p")]) + content_dict[link] = text + return content_dict + + def search(self, search_term: str, num_iter: int = 1) -> List[Dict]: + service = build('customsearch', 'v1', developerKey=self.config.api_key) + results = [] + sanitize_search_term = sanitize_search_query(search_term) + if search_term.isspace(): + return results + res = service.cse().list(q=sanitize_search_term, cx=self.config.cse_id).execute() + results.append(res) + + for _ in range(num_iter - 1): + if 'nextPage' not in res.get('queries', {}): + break + start_idx = res['queries']['nextPage'][0]['startIndex'] + res = service.cse().list(q=search_term, cx=self.config.cse_id, start=start_idx).execute() + results.append(res) + + return results + + def batch_search(self, queries: List[str]) -> List[List[str]]: + with ThreadPoolExecutor() as executor: + return 
list(executor.map(self._retrieve_context, queries)) + + def _retrieve_context(self, query: str) -> List[str]: + + if self.config.snippet_only: + search_results = self.search(query) + contexts = [] + for result in search_results: + for item in result.get("items", []): + title = item.get("title", "") + context = ' '.join(parse_snippet(item.get("snippet", ""))) + if title != "" or context != "": + title = "No title." if not title else title + context = "No snippet available." if not context else context + contexts.append({ + 'document': {"contents": f'\"{title}\"\n{context}'}, + }) + else: + content_dict = self.fetch_web_content(search_results) + contexts = [] + for result in search_results: + for item in result.get("items", []): + link = item["link"] + title = item.get("title", "") + snippet = item.get("snippet", "") + if link in content_dict: + context = self.collect_context(snippet, content_dict[link]) + if title != "" or context != "": + title = "No title." if not title else title + context = "No snippet available." if not context else context + contexts.append({ + 'document': {"contents": f'\"{title}\"\n{context}'}, + }) + + return contexts[:self.config.topk] + + +# --- FastAPI App --- +app = FastAPI(title="Online Search Proxy Server") + +class SearchRequest(BaseModel): + queries: List[str] + +config = OnlineSearchConfig(api_key=args.api_key, cse_id=args.cse_id, topk=args.topk, snippet_only=args.snippet_only) +engine = OnlineSearchEngine(config) + +@app.post("/retrieve") +def search_endpoint(request: SearchRequest): + results = engine.batch_search(request.queries) + return {"result": results} + + +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/code/RL_model/verl/Search-R1/search_r1/search/index_builder.py b/code/RL_model/verl/Search-R1/search_r1/search/index_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..2cba65a65e3656fd6787b5a1fe024c33c630fcaf --- /dev/null +++ b/code/RL_model/verl/Search-R1/search_r1/search/index_builder.py @@ -0,0 +1,349 @@ +import os +import faiss +import json +import warnings +import numpy as np +from typing import cast, List, Dict +import shutil +import subprocess +import argparse +import torch +from tqdm import tqdm +# from LongRAG.retriever.utils import load_model, load_corpus, pooling +import datasets +from transformers import AutoTokenizer, AutoModel, AutoConfig + + +def load_model( + model_path: str, + use_fp16: bool = False + ): + model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + model = AutoModel.from_pretrained(model_path, trust_remote_code=True) + model.eval() + model.cuda() + if use_fp16: + model = model.half() + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=True) + + return model, tokenizer + + +def pooling( + pooler_output, + last_hidden_state, + attention_mask = None, + pooling_method = "mean" + ): + if pooling_method == "mean": + last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0) + return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] + elif pooling_method == "cls": + return last_hidden_state[:, 0] + elif pooling_method == "pooler": + return pooler_output + else: + raise NotImplementedError("Pooling method not implemented!") + + +def load_corpus(corpus_path: str): + corpus = datasets.load_dataset( + 'json', + data_files=corpus_path, + split="train", + num_proc=4) + return corpus + + +class Index_Builder: + r"""A tool class used to build an index used in retrieval. 
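+    Minimal usage sketch (paths and model name are placeholders):
+
+        builder = Index_Builder(
+            retrieval_method="e5",
+            model_path="intfloat/e5-base-v2",
+            corpus_path="corpus.jsonl",
+            save_dir="indexes/",
+            max_length=256,
+            batch_size=512,
+            use_fp16=True,
+            pooling_method="mean",
+            faiss_type="Flat",
+        )
+        builder.build_index()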
+ + """ + def __init__( + self, + retrieval_method, + model_path, + corpus_path, + save_dir, + max_length, + batch_size, + use_fp16, + pooling_method, + faiss_type=None, + embedding_path=None, + save_embedding=False, + faiss_gpu=False + ): + + self.retrieval_method = retrieval_method.lower() + self.model_path = model_path + self.corpus_path = corpus_path + self.save_dir = save_dir + self.max_length = max_length + self.batch_size = batch_size + self.use_fp16 = use_fp16 + self.pooling_method = pooling_method + self.faiss_type = faiss_type if faiss_type is not None else 'Flat' + self.embedding_path = embedding_path + self.save_embedding = save_embedding + self.faiss_gpu = faiss_gpu + + self.gpu_num = torch.cuda.device_count() + # prepare save dir + print(self.save_dir) + if not os.path.exists(self.save_dir): + os.makedirs(self.save_dir) + else: + if not self._check_dir(self.save_dir): + warnings.warn("Some files already exists in save dir and may be overwritten.", UserWarning) + + self.index_save_path = os.path.join(self.save_dir, f"{self.retrieval_method}_{self.faiss_type}.index") + + self.embedding_save_path = os.path.join(self.save_dir, f"emb_{self.retrieval_method}.memmap") + + self.corpus = load_corpus(self.corpus_path) + + print("Finish loading...") + @staticmethod + def _check_dir(dir_path): + r"""Check if the dir path exists and if there is content. + + """ + + if os.path.isdir(dir_path): + if len(os.listdir(dir_path)) > 0: + return False + else: + os.makedirs(dir_path, exist_ok=True) + return True + + def build_index(self): + r"""Constructing different indexes based on selective retrieval method. + + """ + if self.retrieval_method == "bm25": + self.build_bm25_index() + else: + self.build_dense_index() + + def build_bm25_index(self): + """Building BM25 index based on Pyserini library. 
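+        The corpus jsonl is assumed to follow Pyserini's JsonCollection format,
+        one document per line, e.g. {"id": "0", "contents": "\"Title\"\nbody text"}.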
+ + Reference: https://github.com/castorini/pyserini/blob/master/docs/usage-index.md#building-a-bm25-index-direct-java-implementation + """ + + # to use pyserini pipeline, we first need to place jsonl file in the folder + self.save_dir = os.path.join(self.save_dir, "bm25") + os.makedirs(self.save_dir, exist_ok=True) + temp_dir = self.save_dir + "/temp" + temp_file_path = temp_dir + "/temp.jsonl" + os.makedirs(temp_dir) + + # if self.have_contents: + # shutil.copyfile(self.corpus_path, temp_file_path) + # else: + # with open(temp_file_path, "w") as f: + # for item in self.corpus: + # f.write(json.dumps(item) + "\n") + shutil.copyfile(self.corpus_path, temp_file_path) + + print("Start building bm25 index...") + pyserini_args = ["--collection", "JsonCollection", + "--input", temp_dir, + "--index", self.save_dir, + "--generator", "DefaultLuceneDocumentGenerator", + "--threads", "1"] + + subprocess.run(["python", "-m", "pyserini.index.lucene"] + pyserini_args) + + shutil.rmtree(temp_dir) + + print("Finish!") + + def _load_embedding(self, embedding_path, corpus_size, hidden_size): + all_embeddings = np.memmap( + embedding_path, + mode="r", + dtype=np.float32 + ).reshape(corpus_size, hidden_size) + return all_embeddings + + def _save_embedding(self, all_embeddings): + memmap = np.memmap( + self.embedding_save_path, + shape=all_embeddings.shape, + mode="w+", + dtype=all_embeddings.dtype + ) + length = all_embeddings.shape[0] + # add in batch + save_batch_size = 10000 + if length > save_batch_size: + for i in tqdm(range(0, length, save_batch_size), leave=False, desc="Saving Embeddings"): + j = min(i + save_batch_size, length) + memmap[i: j] = all_embeddings[i: j] + else: + memmap[:] = all_embeddings + + def encode_all(self): + if self.gpu_num > 1: + print("Use multi gpu!") + self.encoder = torch.nn.DataParallel(self.encoder) + self.batch_size = self.batch_size * self.gpu_num + + all_embeddings = [] + + for start_idx in tqdm(range(0, len(self.corpus), self.batch_size), desc='Inference Embeddings:'): + + # batch_data_title = self.corpus[start_idx:start_idx+self.batch_size]['title'] + # batch_data_text = self.corpus[start_idx:start_idx+self.batch_size]['text'] + # batch_data = ['"' + title + '"\n' + text for title, text in zip(batch_data_title, batch_data_text)] + batch_data = self.corpus[start_idx:start_idx+self.batch_size]['contents'] + + if self.retrieval_method == "e5": + batch_data = [f"passage: {doc}" for doc in batch_data] + + inputs = self.tokenizer( + batch_data, + padding=True, + truncation=True, + return_tensors='pt', + max_length=self.max_length, + ).to('cuda') + + inputs = {k: v.cuda() for k, v in inputs.items()} + + #TODO: support encoder-only T5 model + if "T5" in type(self.encoder).__name__: + # T5-based retrieval model + decoder_input_ids = torch.zeros( + (inputs['input_ids'].shape[0], 1), dtype=torch.long + ).to(inputs['input_ids'].device) + output = self.encoder( + **inputs, decoder_input_ids=decoder_input_ids, return_dict=True + ) + embeddings = output.last_hidden_state[:, 0, :] + + else: + output = self.encoder(**inputs, return_dict=True) + embeddings = pooling(output.pooler_output, + output.last_hidden_state, + inputs['attention_mask'], + self.pooling_method) + if "dpr" not in self.retrieval_method: + embeddings = torch.nn.functional.normalize(embeddings, dim=-1) + + embeddings = cast(torch.Tensor, embeddings) + embeddings = embeddings.detach().cpu().numpy() + all_embeddings.append(embeddings) + + all_embeddings = np.concatenate(all_embeddings, axis=0) + all_embeddings = 
all_embeddings.astype(np.float32) + + return all_embeddings + + @torch.no_grad() + def build_dense_index(self): + """Obtain the representation of documents based on the embedding model(BERT-based) and + construct a faiss index. + """ + + if os.path.exists(self.index_save_path): + print("The index file already exists and will be overwritten.") + + self.encoder, self.tokenizer = load_model(model_path = self.model_path, + use_fp16 = self.use_fp16) + if self.embedding_path is not None: + hidden_size = self.encoder.config.hidden_size + corpus_size = len(self.corpus) + all_embeddings = self._load_embedding(self.embedding_path, corpus_size, hidden_size) + else: + all_embeddings = self.encode_all() + if self.save_embedding: + self._save_embedding(all_embeddings) + del self.corpus + + # build index + print("Creating index") + dim = all_embeddings.shape[-1] + faiss_index = faiss.index_factory(dim, self.faiss_type, faiss.METRIC_INNER_PRODUCT) + + if self.faiss_gpu: + co = faiss.GpuMultipleClonerOptions() + co.useFloat16 = True + co.shard = True + faiss_index = faiss.index_cpu_to_all_gpus(faiss_index, co) + if not faiss_index.is_trained: + faiss_index.train(all_embeddings) + faiss_index.add(all_embeddings) + faiss_index = faiss.index_gpu_to_cpu(faiss_index) + else: + if not faiss_index.is_trained: + faiss_index.train(all_embeddings) + faiss_index.add(all_embeddings) + + faiss.write_index(faiss_index, self.index_save_path) + print("Finish!") + + +MODEL2POOLING = { + "e5": "mean", + "bge": "cls", + "contriever": "mean", + 'jina': 'mean' +} + + +def main(): + parser = argparse.ArgumentParser(description = "Creating index.") + + # Basic parameters + parser.add_argument('--retrieval_method', type=str) + parser.add_argument('--model_path', type=str, default=None) + parser.add_argument('--corpus_path', type=str) + parser.add_argument('--save_dir', default= 'indexes/',type=str) + + # Parameters for building dense index + parser.add_argument('--max_length', type=int, default=180) + parser.add_argument('--batch_size', type=int, default=512) + parser.add_argument('--use_fp16', default=False, action='store_true') + parser.add_argument('--pooling_method', type=str, default=None) + parser.add_argument('--faiss_type',default=None,type=str) + parser.add_argument('--embedding_path', default=None, type=str) + parser.add_argument('--save_embedding', action='store_true', default=False) + parser.add_argument('--faiss_gpu', default=False, action='store_true') + + args = parser.parse_args() + + if args.pooling_method is None: + pooling_method = 'mean' + for k,v in MODEL2POOLING.items(): + if k in args.retrieval_method.lower(): + pooling_method = v + break + else: + if args.pooling_method not in ['mean','cls','pooler']: + raise NotImplementedError + else: + pooling_method = args.pooling_method + + + index_builder = Index_Builder( + retrieval_method = args.retrieval_method, + model_path = args.model_path, + corpus_path = args.corpus_path, + save_dir = args.save_dir, + max_length = args.max_length, + batch_size = args.batch_size, + use_fp16 = args.use_fp16, + pooling_method = pooling_method, + faiss_type = args.faiss_type, + embedding_path = args.embedding_path, + save_embedding = args.save_embedding, + faiss_gpu = args.faiss_gpu + ) + index_builder.build_index() + + +if __name__ == "__main__": + main() diff --git a/code/RL_model/verl/Search-R1/search_r1/search/rerank_server.py b/code/RL_model/verl/Search-R1/search_r1/search/rerank_server.py new file mode 100644 index 
0000000000000000000000000000000000000000..9edabe881bbc685786d6dde292ae8e72b0216aae --- /dev/null +++ b/code/RL_model/verl/Search-R1/search_r1/search/rerank_server.py @@ -0,0 +1,161 @@ +import argparse +from collections import defaultdict +from typing import Optional +from dataclasses import dataclass, field + +from sentence_transformers import CrossEncoder +import torch +from transformers import HfArgumentParser +import numpy as np + +import uvicorn +from fastapi import FastAPI +from pydantic import BaseModel + + +class BaseCrossEncoder: + def __init__(self, model, batch_size=32, device="cuda"): + self.model = model + self.batch_size = batch_size + self.model.to(device) + + def _passage_to_string(self, doc_item): + if "document" not in doc_item: + content = doc_item['contents'] + else: + content = doc_item['document']['contents'] + title = content.split("\n")[0] + text = "\n".join(content.split("\n")[1:]) + + return f"(Title: {title}) {text}" + + def rerank(self, + queries: list[str], + documents: list[list[dict]]): + """ + Assume documents is a list of list of dicts, where each dict is a document with keys "id" and "contents". + This asumption is made to be consistent with the output of the retrieval server. + """ + assert len(queries) == len(documents) + + pairs = [] + qids = [] + for qid, query in enumerate(queries): + for document in documents: + for doc_item in document: + doc = self._passage_to_string(doc_item) + pairs.append((query, doc)) + qids.append(qid) + + scores = self._predict(pairs) + query_to_doc_scores = defaultdict(list) + + assert len(scores) == len(pairs) == len(qids) + for i in range(len(pairs)): + query, doc = pairs[i] + score = scores[i] + qid = qids[i] + query_to_doc_scores[qid].append((doc, score)) + + sorted_query_to_doc_scores = {} + for query, doc_scores in query_to_doc_scores.items(): + sorted_query_to_doc_scores[query] = sorted(doc_scores, key=lambda x: x[1], reverse=True) + + return sorted_query_to_doc_scores + + def _predict(self, pairs: list[tuple[str, str]]): + raise NotImplementedError + + @classmethod + def load(cls, model_name_or_path, **kwargs): + raise NotImplementedError + + +class SentenceTransformerCrossEncoder(BaseCrossEncoder): + def __init__(self, model, batch_size=32, device="cuda"): + super().__init__(model, batch_size, device) + + def _predict(self, pairs: list[tuple[str, str]]): + scores = self.model.predict(pairs, batch_size=self.batch_size) + scores = scores.tolist() if isinstance(scores, torch.Tensor) or isinstance(scores, np.ndarray) else scores + return scores + + @classmethod + def load(cls, model_name_or_path, **kwargs): + model = CrossEncoder(model_name_or_path) + return cls(model, **kwargs) + + +class RerankRequest(BaseModel): + queries: list[str] + documents: list[list[dict]] + rerank_topk: Optional[int] = None + return_scores: bool = False + + +@dataclass +class RerankerArguments: + max_length: int = field(default=512) + rerank_topk: int = field(default=3) + rerank_model_name_or_path: str = field(default="cross-encoder/ms-marco-MiniLM-L12-v2") + batch_size: int = field(default=32) + reranker_type: str = field(default="sentence_transformer") + +def get_reranker(config): + if config.reranker_type == "sentence_transformer": + return SentenceTransformerCrossEncoder.load( + config.rerank_model_name_or_path, + batch_size=config.batch_size, + device="cuda" if torch.cuda.is_available() else "cpu" + ) + else: + raise ValueError(f"Unknown reranker type: {config.reranker_type}") + + +app = FastAPI() + +@app.post("/rerank") +def 
rerank_endpoint(request: RerankRequest): + """ + Endpoint that accepts queries and performs retrieval. + Input format: + { + "queries": ["What is Python?", "Tell me about neural networks."], + "documents": [[doc_item_1, ..., doc_item_k], [doc_item_1, ..., doc_item_k]], + "rerank_topk": 3, + "return_scores": true + } + """ + if not request.rerank_topk: + request.rerank_topk = config.rerank_topk # fallback to default + + # Perform batch re reranking + # doc_scores already sorted by score + query_to_doc_scores = reranker.rerank(request.queries, request.documents) + + # Format response + resp = [] + for _, doc_scores in query_to_doc_scores.items(): + doc_scores = doc_scores[:request.rerank_topk] + if request.return_scores: + combined = [] + for doc, score in doc_scores: + combined.append({"document": doc, "score": score}) + resp.append(combined) + else: + resp.append([doc for doc, _ in doc_scores]) + return {"result": resp} + + +if __name__ == "__main__": + + # 1) Build a config (could also parse from arguments). + # In real usage, you'd parse your CLI arguments or environment variables. + parser = HfArgumentParser((RerankerArguments)) + config = parser.parse_args_into_dataclasses()[0] + + # 2) Instantiate a global retriever so it is loaded once and reused. + reranker = get_reranker(config) + + # 3) Launch the server. By default, it listens on http://127.0.0.1:8000 + uvicorn.run(app, host="0.0.0.0", port=6980) diff --git a/code/RL_model/verl/Search-R1/search_r1/search/retrieval.py b/code/RL_model/verl/Search-R1/search_r1/search/retrieval.py new file mode 100644 index 0000000000000000000000000000000000000000..125643a7bea6e83c612fe6ed02e25ea1a7464670 --- /dev/null +++ b/code/RL_model/verl/Search-R1/search_r1/search/retrieval.py @@ -0,0 +1,368 @@ +import json +import os +import warnings +from typing import List, Dict +import functools +from tqdm import tqdm +from multiprocessing import Pool +import faiss +import torch +import numpy as np +from transformers import AutoConfig, AutoTokenizer, AutoModel +import argparse +import datasets + + +def load_corpus(corpus_path: str): + corpus = datasets.load_dataset( + 'json', + data_files=corpus_path, + split="train", + num_proc=4) + return corpus + + +def read_jsonl(file_path): + data = [] + + with open(file_path, "r") as f: + readin = f.readlines() + for line in readin: + data.append(json.loads(line)) + return data + + +def load_docs(corpus, doc_idxs): + results = [corpus[int(idx)] for idx in doc_idxs] + + return results + + +def load_model( + model_path: str, + use_fp16: bool = False + ): + model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + model = AutoModel.from_pretrained(model_path, trust_remote_code=True) + model.eval() + model.cuda() + if use_fp16: + model = model.half() + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=True) + + return model, tokenizer + + +def pooling( + pooler_output, + last_hidden_state, + attention_mask = None, + pooling_method = "mean" + ): + if pooling_method == "mean": + last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0) + return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] + elif pooling_method == "cls": + return last_hidden_state[:, 0] + elif pooling_method == "pooler": + return pooler_output + else: + raise NotImplementedError("Pooling method not implemented!") + + +class Encoder: + def __init__(self, model_name, model_path, pooling_method, max_length, use_fp16): + self.model_name = model_name + 
self.model_path = model_path + self.pooling_method = pooling_method + self.max_length = max_length + self.use_fp16 = use_fp16 + + self.model, self.tokenizer = load_model(model_path=model_path, + use_fp16=use_fp16) + + @torch.no_grad() + def encode(self, query_list: List[str], is_query=True) -> np.ndarray: + # processing query for different encoders + if isinstance(query_list, str): + query_list = [query_list] + + if "e5" in self.model_name.lower(): + if is_query: + query_list = [f"query: {query}" for query in query_list] + else: + query_list = [f"passage: {query}" for query in query_list] + + if "bge" in self.model_name.lower(): + if is_query: + query_list = [f"Represent this sentence for searching relevant passages: {query}" for query in query_list] + + inputs = self.tokenizer(query_list, + max_length=self.max_length, + padding=True, + truncation=True, + return_tensors="pt" + ) + inputs = {k: v.cuda() for k, v in inputs.items()} + + if "T5" in type(self.model).__name__: + # T5-based retrieval model + decoder_input_ids = torch.zeros( + (inputs['input_ids'].shape[0], 1), dtype=torch.long + ).to(inputs['input_ids'].device) + output = self.model( + **inputs, decoder_input_ids=decoder_input_ids, return_dict=True + ) + query_emb = output.last_hidden_state[:, 0, :] + + else: + output = self.model(**inputs, return_dict=True) + query_emb = pooling(output.pooler_output, + output.last_hidden_state, + inputs['attention_mask'], + self.pooling_method) + if "dpr" not in self.model_name.lower(): + query_emb = torch.nn.functional.normalize(query_emb, dim=-1) + + query_emb = query_emb.detach().cpu().numpy() + query_emb = query_emb.astype(np.float32, order="C") + return query_emb + + +class BaseRetriever: + """Base object for all retrievers.""" + + def __init__(self, config): + self.config = config + self.retrieval_method = config.retrieval_method + self.topk = config.retrieval_topk + + self.index_path = config.index_path + self.corpus_path = config.corpus_path + + # self.cache_save_path = os.path.join(config.save_dir, 'retrieval_cache.json') + + def _search(self, query: str, num: int, return_score:bool) -> List[Dict[str, str]]: + r"""Retrieve topk relevant documents in corpus. 
+ Return: + list: contains information related to the document, including: + contents: used for building index + title: (if provided) + text: (if provided) + """ + pass + + def _batch_search(self, query_list, num, return_score): + pass + + def search(self, *args, **kwargs): + return self._search(*args, **kwargs) + + def batch_search(self, *args, **kwargs): + return self._batch_search(*args, **kwargs) + + +class BM25Retriever(BaseRetriever): + r"""BM25 retriever based on pre-built pyserini index.""" + + def __init__(self, config): + super().__init__(config) + from pyserini.search.lucene import LuceneSearcher + self.searcher = LuceneSearcher(self.index_path) + self.contain_doc = self._check_contain_doc() + if not self.contain_doc: + self.corpus = load_corpus(self.corpus_path) + self.max_process_num = 8 + + def _check_contain_doc(self): + r"""Check if the index contains document content + """ + return self.searcher.doc(0).raw() is not None + + def _search(self, query: str, num: int = None, return_score = False) -> List[Dict[str, str]]: + if num is None: + num = self.topk + + hits = self.searcher.search(query, num) + if len(hits) < 1: + if return_score: + return [],[] + else: + return [] + + scores = [hit.score for hit in hits] + if len(hits) < num: + warnings.warn('Not enough documents retrieved!') + else: + hits = hits[:num] + + if self.contain_doc: + all_contents = [json.loads(self.searcher.doc(hit.docid).raw())['contents'] for hit in hits] + results = [{'title': content.split("\n")[0].strip("\""), + 'text': "\n".join(content.split("\n")[1:]), + 'contents': content} for content in all_contents] + else: + results = load_docs(self.corpus, [hit.docid for hit in hits]) + + if return_score: + return results, scores + else: + return results + + def _batch_search(self, query_list, num: int = None, return_score = False): + # TODO: modify batch method + results = [] + scores = [] + for query in query_list: + item_result, item_score = self._search(query, num,True) + results.append(item_result) + scores.append(item_score) + + if return_score: + return results, scores + else: + return results + +def get_available_gpu_memory(): + memory_info = [] + for i in range(torch.cuda.device_count()): + total_memory = torch.cuda.get_device_properties(i).total_memory + allocated_memory = torch.cuda.memory_allocated(i) + free_memory = total_memory - allocated_memory + memory_info.append((i, free_memory / 1e9)) # Convert to GB + return memory_info + + +class DenseRetriever(BaseRetriever): + r"""Dense retriever based on pre-built faiss index.""" + + def __init__(self, config: dict): + super().__init__(config) + self.index = faiss.read_index(self.index_path) + if config.faiss_gpu: + co = faiss.GpuMultipleClonerOptions() + co.useFloat16 = True + co.shard = True + self.index = faiss.index_cpu_to_all_gpus(self.index, co=co) + # self.index = faiss.index_cpu_to_all_gpus(self.index) + + self.corpus = load_corpus(self.corpus_path) + self.encoder = Encoder( + model_name = self.retrieval_method, + model_path = config.retrieval_model_path, + pooling_method = config.retrieval_pooling_method, + max_length = config.retrieval_query_max_length, + use_fp16 = config.retrieval_use_fp16 + ) + self.topk = config.retrieval_topk + self.batch_size = self.config.retrieval_batch_size + + def _search(self, query: str, num: int = None, return_score = False): + if num is None: + num = self.topk + query_emb = self.encoder.encode(query) + scores, idxs = self.index.search(query_emb, k=num) + idxs = idxs[0] + scores = scores[0] + + results = 
load_docs(self.corpus, idxs) + if return_score: + return results, scores + else: + return results + + def _batch_search(self, query_list: List[str], num: int = None, return_score = False): + if isinstance(query_list, str): + query_list = [query_list] + if num is None: + num = self.topk + + batch_size = self.batch_size + + results = [] + scores = [] + + for start_idx in tqdm(range(0, len(query_list), batch_size), desc='Retrieval process: '): + query_batch = query_list[start_idx:start_idx + batch_size] + + # from time import time + # a = time() + batch_emb = self.encoder.encode(query_batch) + # b = time() + # print(f'################### encode time {b-a} #####################') + batch_scores, batch_idxs = self.index.search(batch_emb, k=num) + batch_scores = batch_scores.tolist() + batch_idxs = batch_idxs.tolist() + # print(f'################### search time {time()-b} #####################') + # exit() + + flat_idxs = sum(batch_idxs, []) + batch_results = load_docs(self.corpus, flat_idxs) + batch_results = [batch_results[i*num : (i+1)*num] for i in range(len(batch_idxs))] + + scores.extend(batch_scores) + results.extend(batch_results) + + if return_score: + return results, scores + else: + return results + +def get_retriever(config): + r"""Automatically select retriever class based on config's retrieval method + + Args: + config (dict): configuration with 'retrieval_method' key + + Returns: + Retriever: retriever instance + """ + if config.retrieval_method == "bm25": + return BM25Retriever(config) + else: + return DenseRetriever(config) + + +def get_dataset(config): + """Load dataset from config.""" + + split_path = os.path.join(config.dataset_path, f'{config.data_split}.jsonl') + return read_jsonl(split_path) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description = "Retrieval") + + # Basic parameters + parser.add_argument('--retrieval_method', type=str) + parser.add_argument('--retrieval_topk', type=int, default=10) + parser.add_argument('--index_path', type=str, default=None) + parser.add_argument('--corpus_path', type=str) + parser.add_argument('--dataset_path', default=None, type=str) + + parser.add_argument('--faiss_gpu', default=True, type=bool) + parser.add_argument('--data_split', default="train", type=str) + + parser.add_argument('--retrieval_model_path', type=str, default=None) + parser.add_argument('--retrieval_pooling_method', default='mean', type=str) + parser.add_argument('--retrieval_query_max_length', default=256, type=str) + parser.add_argument('--retrieval_use_fp16', action='store_true', default=False) + parser.add_argument('--retrieval_batch_size', default=512, type=int) + + args = parser.parse_args() + + args.index_path = os.path.join(args.index_path, f'{args.retrieval_method}_Flat.index') if args.retrieval_method != 'bm25' else os.path.join(args.index_path, 'bm25') + + # load dataset + all_split = get_dataset(args) + + input_query = [sample['question'] for sample in all_split[:512]] + + # initialize the retriever and conduct retrieval + retriever = get_retriever(args) + print('Start Retrieving ...') + results, scores = retriever.batch_search(input_query, return_score=True) + + # from IPython import embed + # embed() diff --git a/code/RL_model/verl/Search-R1/search_r1/search/retrieval.sh b/code/RL_model/verl/Search-R1/search_r1/search/retrieval.sh new file mode 100644 index 0000000000000000000000000000000000000000..5326ea2840f3a816540fea28f8b557ae02291248 --- /dev/null +++ b/code/RL_model/verl/Search-R1/search_r1/search/retrieval.sh @@ -0,0 +1,25 
@@ + +DATA_NAME=nq + +DATASET_PATH="/home/peterjin/mnt/data/$DATA_NAME" + +SPLIT='test' +TOPK=3 + +INDEX_PATH=/home/peterjin/mnt/index/wiki-18 +CORPUS_PATH=/home/peterjin/mnt/data/retrieval-corpus/wiki-18.jsonl +SAVE_NAME=e5_${TOPK}_wiki18.json + +# INDEX_PATH=/home/peterjin/rm_retrieval_corpus/index/wiki-21 +# CORPUS_PATH=/home/peterjin/rm_retrieval_corpus/corpora/wiki/enwiki-dec2021/text-list-100-sec.jsonl +# SAVE_NAME=e5_${TOPK}_wiki21.json + +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python retrieval.py --retrieval_method e5 \ + --retrieval_topk $TOPK \ + --index_path $INDEX_PATH \ + --corpus_path $CORPUS_PATH \ + --dataset_path $DATASET_PATH \ + --data_split $SPLIT \ + --retrieval_model_path "intfloat/e5-base-v2" \ + --retrieval_pooling_method "mean" \ + --retrieval_batch_size 512 \ diff --git a/code/RL_model/verl/Search-R1/search_r1/search/retrieval_request.py b/code/RL_model/verl/Search-R1/search_r1/search/retrieval_request.py new file mode 100644 index 0000000000000000000000000000000000000000..de0a4df6d7adc71c8366938572898c6116276c0e --- /dev/null +++ b/code/RL_model/verl/Search-R1/search_r1/search/retrieval_request.py @@ -0,0 +1,23 @@ +import requests + +# URL for your local FastAPI server +url = "http://127.0.0.1:8000/retrieve" + +# Example payload +payload = { + "queries": ["What is the capital of France?", "Explain neural networks."] * 200, + "topk": 5, + "return_scores": True +} + +# Send POST request +response = requests.post(url, json=payload) + +# Raise an exception if the request failed +response.raise_for_status() + +# Get the JSON response +retrieved_data = response.json() + +print("Response from server:") +print(retrieved_data) diff --git a/code/RL_model/verl/Search-R1/search_r1/search/retrieval_rerank_server.py b/code/RL_model/verl/Search-R1/search_r1/search/retrieval_rerank_server.py new file mode 100644 index 0000000000000000000000000000000000000000..a9e14f7bcde1c8c50076ccf464e5e5acdc1bdcff --- /dev/null +++ b/code/RL_model/verl/Search-R1/search_r1/search/retrieval_rerank_server.py @@ -0,0 +1,123 @@ +# pip install -U sentence-transformers +import os +import re +import argparse +from dataclasses import dataclass, field +from typing import List, Optional +from collections import defaultdict + +import torch +import numpy as np +from fastapi import FastAPI +from pydantic import BaseModel +from sentence_transformers import CrossEncoder + +from retrieval_server import get_retriever, Config as RetrieverConfig +from rerank_server import SentenceTransformerCrossEncoder + +app = FastAPI() + +def convert_title_format(text): + # Use regex to extract the title and the content + match = re.match(r'\(Title:\s*([^)]+)\)\s*(.+)', text, re.DOTALL) + if match: + title, content = match.groups() + return f'\"{title}\"\n{content}' + else: + return text + +# ----------- Combined Request Schema ----------- +class SearchRequest(BaseModel): + queries: List[str] + topk_retrieval: Optional[int] = 10 + topk_rerank: Optional[int] = 3 + return_scores: bool = False + +# ----------- Reranker Config Schema ----------- +@dataclass +class RerankerArguments: + max_length: int = field(default=512) + rerank_topk: int = field(default=3) + rerank_model_name_or_path: str = field(default="cross-encoder/ms-marco-MiniLM-L12-v2") + batch_size: int = field(default=32) + reranker_type: str = field(default="sentence_transformer") + +def get_reranker(config): + if config.reranker_type == "sentence_transformer": + return SentenceTransformerCrossEncoder.load( + config.rerank_model_name_or_path, + 
batch_size=config.batch_size, + device="cuda" if torch.cuda.is_available() else "cpu" + ) + else: + raise ValueError(f"Unknown reranker type: {config.reranker_type}") + +# ----------- Endpoint ----------- +@app.post("/retrieve") +def search_endpoint(request: SearchRequest): + # Step 1: Retrieve documents + retrieved_docs = retriever.batch_search( + query_list=request.queries, + num=request.topk_retrieval, + return_score=False + ) + + # Step 2: Rerank + reranked = reranker.rerank(request.queries, retrieved_docs) + + # Step 3: Format response + response = [] + for i, doc_scores in reranked.items(): + doc_scores = doc_scores[:request.topk_rerank] + if request.return_scores: + combined = [] + for doc, score in doc_scores: + combined.append({"document": convert_title_format(doc), "score": score}) + response.append(combined) + else: + response.append([convert_title_format(doc) for doc, _ in doc_scores]) + + return {"result": response} + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description="Launch the local faiss retriever.") + # retriever + parser.add_argument("--index_path", type=str, default="/home/peterjin/mnt/index/wiki-18/e5_Flat.index", help="Corpus indexing file.") + parser.add_argument("--corpus_path", type=str, default="/home/peterjin/mnt/data/retrieval-corpus/wiki-18.jsonl", help="Local corpus file.") + parser.add_argument("--retrieval_topk", type=int, default=10, help="Number of retrieved passages for one query.") + parser.add_argument("--retriever_name", type=str, default="e5", help="Name of the retriever model.") + parser.add_argument("--retriever_model", type=str, default="intfloat/e5-base-v2", help="Path of the retriever model.") + parser.add_argument('--faiss_gpu', action='store_true', help='Use GPU for computation') + # reranker + parser.add_argument("--reranking_topk", type=int, default=3, help="Number of reranked passages for one query.") + parser.add_argument("--reranker_model", type=str, default="cross-encoder/ms-marco-MiniLM-L12-v2", help="Path of the reranker model.") + parser.add_argument("--reranker_batch_size", type=int, default=32, help="Batch size for the reranker inference.") + + args = parser.parse_args() + + # ----------- Load Retriever and Reranker ----------- + retriever_config = RetrieverConfig( + retrieval_method = args.retriever_name, + index_path=args.index_path, + corpus_path=args.corpus_path, + retrieval_topk=args.retrieval_topk, + faiss_gpu=args.faiss_gpu, + retrieval_model_path=args.retriever_model, + retrieval_pooling_method="mean", + retrieval_query_max_length=256, + retrieval_use_fp16=True, + retrieval_batch_size=512, + ) + retriever = get_retriever(retriever_config) + + reranker_config = RerankerArguments( + rerank_topk = args.reranking_topk, + rerank_model_name_or_path = args.reranker_model, + batch_size = args.reranker_batch_size, + ) + reranker = get_reranker(reranker_config) + + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/code/RL_model/verl/Search-R1/search_r1/search/retrieval_server.py b/code/RL_model/verl/Search-R1/search_r1/search/retrieval_server.py new file mode 100644 index 0000000000000000000000000000000000000000..f39698980c1da3abdf715dcdd78916cf1dbdc935 --- /dev/null +++ b/code/RL_model/verl/Search-R1/search_r1/search/retrieval_server.py @@ -0,0 +1,392 @@ +import json +import os +import warnings +from typing import List, Dict, Optional +import argparse + +import faiss +import torch +import numpy as np +from transformers import AutoConfig, AutoTokenizer, AutoModel +from tqdm import 
tqdm +import datasets + +import uvicorn +from fastapi import FastAPI +from pydantic import BaseModel + +def load_corpus(corpus_path: str): + corpus = datasets.load_dataset( + 'json', + data_files=corpus_path, + split="train", + num_proc=4 + ) + return corpus + +def read_jsonl(file_path): + data = [] + with open(file_path, "r") as f: + for line in f: + data.append(json.loads(line)) + return data + +def load_docs(corpus, doc_idxs): + results = [corpus[int(idx)] for idx in doc_idxs] + return results + +def load_model(model_path: str, use_fp16: bool = False): + model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + model = AutoModel.from_pretrained(model_path, trust_remote_code=True) + model.eval() + model.cuda() + if use_fp16: + model = model.half() + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=True) + return model, tokenizer + +def pooling( + pooler_output, + last_hidden_state, + attention_mask = None, + pooling_method = "mean" +): + if pooling_method == "mean": + last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0) + return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] + elif pooling_method == "cls": + return last_hidden_state[:, 0] + elif pooling_method == "pooler": + return pooler_output + else: + raise NotImplementedError("Pooling method not implemented!") + +class Encoder: + def __init__(self, model_name, model_path, pooling_method, max_length, use_fp16): + self.model_name = model_name + self.model_path = model_path + self.pooling_method = pooling_method + self.max_length = max_length + self.use_fp16 = use_fp16 + + self.model, self.tokenizer = load_model(model_path=model_path, use_fp16=use_fp16) + self.model.eval() + + @torch.no_grad() + def encode(self, query_list: List[str], is_query=True) -> np.ndarray: + # processing query for different encoders + if isinstance(query_list, str): + query_list = [query_list] + + if "e5" in self.model_name.lower(): + if is_query: + query_list = [f"query: {query}" for query in query_list] + else: + query_list = [f"passage: {query}" for query in query_list] + + if "bge" in self.model_name.lower(): + if is_query: + query_list = [f"Represent this sentence for searching relevant passages: {query}" for query in query_list] + + inputs = self.tokenizer(query_list, + max_length=self.max_length, + padding=True, + truncation=True, + return_tensors="pt" + ) + inputs = {k: v.cuda() for k, v in inputs.items()} + + if "T5" in type(self.model).__name__: + # T5-based retrieval model + decoder_input_ids = torch.zeros( + (inputs['input_ids'].shape[0], 1), dtype=torch.long + ).to(inputs['input_ids'].device) + output = self.model( + **inputs, decoder_input_ids=decoder_input_ids, return_dict=True + ) + query_emb = output.last_hidden_state[:, 0, :] + else: + output = self.model(**inputs, return_dict=True) + query_emb = pooling(output.pooler_output, + output.last_hidden_state, + inputs['attention_mask'], + self.pooling_method) + if "dpr" not in self.model_name.lower(): + query_emb = torch.nn.functional.normalize(query_emb, dim=-1) + + query_emb = query_emb.detach().cpu().numpy() + query_emb = query_emb.astype(np.float32, order="C") + + del inputs, output + torch.cuda.empty_cache() + + return query_emb + +class BaseRetriever: + def __init__(self, config): + self.config = config + self.retrieval_method = config.retrieval_method + self.topk = config.retrieval_topk + + self.index_path = config.index_path + self.corpus_path = config.corpus_path + + def _search(self, 
query: str, num: int, return_score: bool): + raise NotImplementedError + + def _batch_search(self, query_list: List[str], num: int, return_score: bool): + raise NotImplementedError + + def search(self, query: str, num: int = None, return_score: bool = False): + return self._search(query, num, return_score) + + def batch_search(self, query_list: List[str], num: int = None, return_score: bool = False): + return self._batch_search(query_list, num, return_score) + +class BM25Retriever(BaseRetriever): + def __init__(self, config): + super().__init__(config) + from pyserini.search.lucene import LuceneSearcher + self.searcher = LuceneSearcher(self.index_path) + self.contain_doc = self._check_contain_doc() + if not self.contain_doc: + self.corpus = load_corpus(self.corpus_path) + self.max_process_num = 8 + + def _check_contain_doc(self): + return self.searcher.doc(0).raw() is not None + + def _search(self, query: str, num: int = None, return_score: bool = False): + if num is None: + num = self.topk + hits = self.searcher.search(query, num) + if len(hits) < 1: + if return_score: + return [], [] + else: + return [] + scores = [hit.score for hit in hits] + if len(hits) < num: + warnings.warn('Not enough documents retrieved!') + else: + hits = hits[:num] + + if self.contain_doc: + all_contents = [ + json.loads(self.searcher.doc(hit.docid).raw())['contents'] + for hit in hits + ] + results = [ + { + 'title': content.split("\n")[0].strip("\""), + 'text': "\n".join(content.split("\n")[1:]), + 'contents': content + } + for content in all_contents + ] + else: + results = load_docs(self.corpus, [hit.docid for hit in hits]) + + if return_score: + return results, scores + else: + return results + + def _batch_search(self, query_list: List[str], num: int = None, return_score: bool = False): + results = [] + scores = [] + for query in query_list: + item_result, item_score = self._search(query, num, True) + results.append(item_result) + scores.append(item_score) + if return_score: + return results, scores + else: + return results + +class DenseRetriever(BaseRetriever): + def __init__(self, config): + super().__init__(config) + self.index = faiss.read_index(self.index_path) + if config.faiss_gpu: + co = faiss.GpuMultipleClonerOptions() + co.useFloat16 = True + co.shard = True + self.index = faiss.index_cpu_to_all_gpus(self.index, co=co) + + self.corpus = load_corpus(self.corpus_path) + self.encoder = Encoder( + model_name = self.retrieval_method, + model_path = config.retrieval_model_path, + pooling_method = config.retrieval_pooling_method, + max_length = config.retrieval_query_max_length, + use_fp16 = config.retrieval_use_fp16 + ) + self.topk = config.retrieval_topk + self.batch_size = config.retrieval_batch_size + + def _search(self, query: str, num: int = None, return_score: bool = False): + if num is None: + num = self.topk + query_emb = self.encoder.encode(query) + scores, idxs = self.index.search(query_emb, k=num) + idxs = idxs[0] + scores = scores[0] + results = load_docs(self.corpus, idxs) + if return_score: + return results, scores.tolist() + else: + return results + + def _batch_search(self, query_list: List[str], num: int = None, return_score: bool = False): + if isinstance(query_list, str): + query_list = [query_list] + if num is None: + num = self.topk + + results = [] + scores = [] + for start_idx in tqdm(range(0, len(query_list), self.batch_size), desc='Retrieval process: '): + query_batch = query_list[start_idx:start_idx + self.batch_size] + batch_emb = self.encoder.encode(query_batch) + 
batch_scores, batch_idxs = self.index.search(batch_emb, k=num) + batch_scores = batch_scores.tolist() + batch_idxs = batch_idxs.tolist() + + # load_docs is not vectorized, but is a python list approach + flat_idxs = sum(batch_idxs, []) + batch_results = load_docs(self.corpus, flat_idxs) + # chunk them back + batch_results = [batch_results[i*num : (i+1)*num] for i in range(len(batch_idxs))] + + results.extend(batch_results) + scores.extend(batch_scores) + + del batch_emb, batch_scores, batch_idxs, query_batch, flat_idxs, batch_results + torch.cuda.empty_cache() + + if return_score: + return results, scores + else: + return results + +def get_retriever(config): + if config.retrieval_method == "bm25": + return BM25Retriever(config) + else: + return DenseRetriever(config) + + +##################################### +# FastAPI server below +##################################### + +class Config: + """ + Minimal config class (simulating your argparse) + Replace this with your real arguments or load them dynamically. + """ + def __init__( + self, + retrieval_method: str = "bm25", + retrieval_topk: int = 10, + index_path: str = "./index/bm25", + corpus_path: str = "./data/corpus.jsonl", + dataset_path: str = "./data", + data_split: str = "train", + faiss_gpu: bool = True, + retrieval_model_path: str = "./model", + retrieval_pooling_method: str = "mean", + retrieval_query_max_length: int = 256, + retrieval_use_fp16: bool = False, + retrieval_batch_size: int = 128 + ): + self.retrieval_method = retrieval_method + self.retrieval_topk = retrieval_topk + self.index_path = index_path + self.corpus_path = corpus_path + self.dataset_path = dataset_path + self.data_split = data_split + self.faiss_gpu = faiss_gpu + self.retrieval_model_path = retrieval_model_path + self.retrieval_pooling_method = retrieval_pooling_method + self.retrieval_query_max_length = retrieval_query_max_length + self.retrieval_use_fp16 = retrieval_use_fp16 + self.retrieval_batch_size = retrieval_batch_size + + +class QueryRequest(BaseModel): + queries: List[str] + topk: Optional[int] = None + return_scores: bool = False + + +app = FastAPI() + +@app.post("/retrieve") +def retrieve_endpoint(request: QueryRequest): + """ + Endpoint that accepts queries and performs retrieval. 
+ Input format: + { + "queries": ["What is Python?", "Tell me about neural networks."], + "topk": 3, + "return_scores": true + } + """ + if not request.topk: + request.topk = config.retrieval_topk # fallback to default + + # Perform batch retrieval. Always request scores here: batch_search returns a bare list when return_score is False, which would break the tuple unpacking below. + results, scores = retriever.batch_search( + query_list=request.queries, + num=request.topk, + return_score=True + ) + + # Format response + resp = [] + for i, single_result in enumerate(results): + if request.return_scores: + # If scores are requested, combine them with the documents + combined = [] + for doc, score in zip(single_result, scores[i]): + combined.append({"document": doc, "score": score}) + resp.append(combined) + else: + resp.append(single_result) + return {"result": resp} + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description="Launch the local faiss retriever.") + parser.add_argument("--index_path", type=str, default="/home/peterjin/mnt/index/wiki-18/e5_Flat.index", help="Corpus indexing file.") + parser.add_argument("--corpus_path", type=str, default="/home/peterjin/mnt/data/retrieval-corpus/wiki-18.jsonl", help="Local corpus file.") + parser.add_argument("--topk", type=int, default=3, help="Number of retrieved passages for one query.") + parser.add_argument("--retriever_name", type=str, default="e5", help="Name of the retriever model.") + parser.add_argument("--retriever_model", type=str, default="intfloat/e5-base-v2", help="Path of the retriever model.") + parser.add_argument('--faiss_gpu', action='store_true', help='Use GPU for computation') + + args = parser.parse_args() + + # 1) Build a config from the parsed CLI arguments. + # (These values could also come from environment variables.) + config = Config( + retrieval_method = args.retriever_name, # "bm25" or a dense retriever name such as "e5" + index_path=args.index_path, + corpus_path=args.corpus_path, + retrieval_topk=args.topk, + faiss_gpu=args.faiss_gpu, + retrieval_model_path=args.retriever_model, + retrieval_pooling_method="mean", + retrieval_query_max_length=256, + retrieval_use_fp16=True, + retrieval_batch_size=512, + ) + + # 2) Instantiate a global retriever so it is loaded once and reused. + retriever = get_retriever(config) + + # 3) Launch the server. By default, it listens on http://127.0.0.1:8000 + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/code/RL_model/verl/Search-R1/search_r1/search/serp_search_server.py b/code/RL_model/verl/Search-R1/search_r1/search/serp_search_server.py new file mode 100644 index 0000000000000000000000000000000000000000..30a10de3fa44aa6af20a12417ed9cf215319ad6f --- /dev/null +++ b/code/RL_model/verl/Search-R1/search_r1/search/serp_search_server.py @@ -0,0 +1,112 @@ +import os +import requests +from fastapi import FastAPI +from pydantic import BaseModel +from typing import List, Optional, Dict +from concurrent.futures import ThreadPoolExecutor +import argparse +import uvicorn + +parser = argparse.ArgumentParser(description="Launch online search server.") +parser.add_argument('--search_url', type=str, required=True, + help="URL for search engine (e.g. 
https://serpapi.com/search)") +parser.add_argument('--topk', type=int, default=3, + help="Number of results to return per query") +parser.add_argument('--serp_api_key', type=str, default=None, + help="SerpAPI key for online search") +parser.add_argument('--serp_engine', type=str, default="google", + help="SerpAPI engine for online search") +args = parser.parse_args() + +# --- Config --- +class OnlineSearchConfig: + def __init__( + self, + search_url: str = "https://serpapi.com/search", + topk: int = 3, + serp_api_key: Optional[str] = None, + serp_engine: Optional[str] = None, + ): + self.search_url = search_url + self.topk = topk + self.serp_api_key = serp_api_key + self.serp_engine = serp_engine + + +# --- Online Search Wrapper --- +class OnlineSearchEngine: + def __init__(self, config: OnlineSearchConfig): + self.config = config + + def _search_query(self, query: str): + params = { + "engine": self.config.serp_engine, + "q": query, + "api_key": self.config.serp_api_key, + } + response = requests.get(self.config.search_url, params=params) + return response.json() + + def batch_search(self, queries: List[str]): + results = [] + with ThreadPoolExecutor() as executor: + for result in executor.map(self._search_query, queries): + results.append(self._process_result(result)) + return results + + def _process_result(self, search_result: Dict): + results = [] + + answer_box = search_result.get('answer_box', {}) + if answer_box: + title = answer_box.get('title', 'No title.') + snippet = answer_box.get('snippet', 'No snippet available.') + results.append({ + 'document': {"contents": f'\"{title}\"\n{snippet}'}, + }) + + organic_results = search_result.get('organic_results', []) + for _, result in enumerate(organic_results[:self.config.topk]): + title = result.get('title', 'No title.') + snippet = result.get('snippet', 'No snippet available.') + results.append({ + 'document': {"contents": f'\"{title}\"\n{snippet}'}, + }) + + related_results = search_result.get('related_questions', []) + for _, result in enumerate(related_results[:self.config.topk]): + title = result.get('question', 'No title.') # question is the title here + snippet = result.get('snippet', 'No snippet available.') + results.append({ + 'document': {"contents": f'\"{title}\"\n{snippet}'}, + }) + + return results + + +# --- FastAPI Setup --- +app = FastAPI(title="Online Search Proxy Server") + +class SearchRequest(BaseModel): + queries: List[str] + +# Instantiate global config + engine +config = OnlineSearchConfig( + search_url=args.search_url, + topk=args.topk, + serp_api_key=args.serp_api_key, + serp_engine=args.serp_engine, +) +engine = OnlineSearchEngine(config) + +# --- Routes --- +@app.post("/retrieve") +def search_endpoint(request: SearchRequest): + results = engine.batch_search(request.queries) + return {"result": results} + +## return {"result": List[List[{'document': {"id": xx, "content": "title" + \n + "content"}, 'score': xx}]]} + +if __name__ == "__main__": + # 3) Launch the server. 
By default, it listens on http://127.0.0.1:8000 + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/code/RL_model/verl/Search-R1/verl.egg-info/SOURCES.txt b/code/RL_model/verl/Search-R1/verl.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9406e91a62af724dbb31fe4f07363c0b81bafc7 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl.egg-info/SOURCES.txt @@ -0,0 +1,190 @@ +LICENSE +README.md +pyproject.toml +setup.py +./search_r1/__init__.py +./search_r1/llm_agent/__init__.py +./search_r1/llm_agent/generation.py +./search_r1/llm_agent/tensor_helper.py +./verl/__init__.py +./verl/protocol.py +./verl/models/__init__.py +./verl/models/registry.py +./verl/models/weight_loader_registry.py +./verl/models/llama/__init__.py +./verl/models/llama/megatron/__init__.py +./verl/models/llama/megatron/modeling_llama_megatron.py +./verl/models/llama/megatron/checkpoint_utils/__init__.py +./verl/models/llama/megatron/checkpoint_utils/llama_loader.py +./verl/models/llama/megatron/checkpoint_utils/llama_saver.py +./verl/models/llama/megatron/layers/__init__.py +./verl/models/llama/megatron/layers/parallel_attention.py +./verl/models/llama/megatron/layers/parallel_decoder.py +./verl/models/llama/megatron/layers/parallel_linear.py +./verl/models/llama/megatron/layers/parallel_mlp.py +./verl/models/llama/megatron/layers/parallel_rmsnorm.py +./verl/models/transformers/__init__.py +./verl/models/transformers/llama.py +./verl/models/transformers/monkey_patch.py +./verl/models/transformers/qwen2.py +./verl/single_controller/__init__.py +./verl/single_controller/base/__init__.py +./verl/single_controller/base/decorator.py +./verl/single_controller/base/worker.py +./verl/single_controller/base/worker_group.py +./verl/single_controller/base/megatron/__init__.py +./verl/single_controller/base/megatron/worker.py +./verl/single_controller/base/megatron/worker_group.py +./verl/single_controller/base/register_center/__init__.py +./verl/single_controller/base/register_center/ray.py +./verl/single_controller/ray/__init__.py +./verl/single_controller/ray/base.py +./verl/single_controller/ray/megatron.py +./verl/third_party/__init__.py +./verl/third_party/vllm/__init__.py +./verl/third_party/vllm/vllm_v_0_3_1/__init__.py +./verl/third_party/vllm/vllm_v_0_3_1/arg_utils.py +./verl/third_party/vllm/vllm_v_0_3_1/config.py +./verl/third_party/vllm/vllm_v_0_3_1/llm.py +./verl/third_party/vllm/vllm_v_0_3_1/llm_engine_sp.py +./verl/third_party/vllm/vllm_v_0_3_1/model_loader.py +./verl/third_party/vllm/vllm_v_0_3_1/model_runner.py +./verl/third_party/vllm/vllm_v_0_3_1/parallel_state.py +./verl/third_party/vllm/vllm_v_0_3_1/tokenizer.py +./verl/third_party/vllm/vllm_v_0_3_1/weight_loaders.py +./verl/third_party/vllm/vllm_v_0_3_1/worker.py +./verl/third_party/vllm/vllm_v_0_4_2/__init__.py +./verl/third_party/vllm/vllm_v_0_4_2/arg_utils.py +./verl/third_party/vllm/vllm_v_0_4_2/config.py +./verl/third_party/vllm/vllm_v_0_4_2/dtensor_weight_loaders.py +./verl/third_party/vllm/vllm_v_0_4_2/hf_weight_loader.py +./verl/third_party/vllm/vllm_v_0_4_2/llm.py +./verl/third_party/vllm/vllm_v_0_4_2/llm_engine_sp.py +./verl/third_party/vllm/vllm_v_0_4_2/megatron_weight_loaders.py +./verl/third_party/vllm/vllm_v_0_4_2/model_loader.py +./verl/third_party/vllm/vllm_v_0_4_2/model_runner.py +./verl/third_party/vllm/vllm_v_0_4_2/parallel_state.py +./verl/third_party/vllm/vllm_v_0_4_2/spmd_gpu_executor.py +./verl/third_party/vllm/vllm_v_0_4_2/tokenizer.py +./verl/third_party/vllm/vllm_v_0_4_2/worker.py 
+./verl/third_party/vllm/vllm_v_0_5_4/__init__.py +./verl/third_party/vllm/vllm_v_0_5_4/arg_utils.py +./verl/third_party/vllm/vllm_v_0_5_4/config.py +./verl/third_party/vllm/vllm_v_0_5_4/dtensor_weight_loaders.py +./verl/third_party/vllm/vllm_v_0_5_4/hf_weight_loader.py +./verl/third_party/vllm/vllm_v_0_5_4/llm.py +./verl/third_party/vllm/vllm_v_0_5_4/llm_engine_sp.py +./verl/third_party/vllm/vllm_v_0_5_4/megatron_weight_loaders.py +./verl/third_party/vllm/vllm_v_0_5_4/model_loader.py +./verl/third_party/vllm/vllm_v_0_5_4/model_runner.py +./verl/third_party/vllm/vllm_v_0_5_4/parallel_state.py +./verl/third_party/vllm/vllm_v_0_5_4/spmd_gpu_executor.py +./verl/third_party/vllm/vllm_v_0_5_4/tokenizer.py +./verl/third_party/vllm/vllm_v_0_5_4/worker.py +./verl/third_party/vllm/vllm_v_0_6_3/__init__.py +./verl/third_party/vllm/vllm_v_0_6_3/arg_utils.py +./verl/third_party/vllm/vllm_v_0_6_3/config.py +./verl/third_party/vllm/vllm_v_0_6_3/dtensor_weight_loaders.py +./verl/third_party/vllm/vllm_v_0_6_3/hf_weight_loader.py +./verl/third_party/vllm/vllm_v_0_6_3/llm.py +./verl/third_party/vllm/vllm_v_0_6_3/llm_engine_sp.py +./verl/third_party/vllm/vllm_v_0_6_3/megatron_weight_loaders.py +./verl/third_party/vllm/vllm_v_0_6_3/model_loader.py +./verl/third_party/vllm/vllm_v_0_6_3/model_runner.py +./verl/third_party/vllm/vllm_v_0_6_3/parallel_state.py +./verl/third_party/vllm/vllm_v_0_6_3/spmd_gpu_executor.py +./verl/third_party/vllm/vllm_v_0_6_3/tokenizer.py +./verl/third_party/vllm/vllm_v_0_6_3/worker.py +./verl/trainer/__init__.py +./verl/trainer/fsdp_sft_trainer.py +./verl/trainer/main_eval.py +./verl/trainer/main_generation.py +./verl/trainer/main_ppo.py +./verl/trainer/main_ppo_format.py +./verl/trainer/config/evaluation.yaml +./verl/trainer/config/generation.yaml +./verl/trainer/config/ppo_megatron_trainer.yaml +./verl/trainer/config/ppo_trainer.yaml +./verl/trainer/config/sft_trainer.yaml +./verl/trainer/ppo/__init__.py +./verl/trainer/ppo/core_algos.py +./verl/trainer/ppo/ray_trainer.py +./verl/utils/__init__.py +./verl/utils/config.py +./verl/utils/distributed.py +./verl/utils/flops_counter.py +./verl/utils/fs.py +./verl/utils/fsdp_utils.py +./verl/utils/hdfs_io.py +./verl/utils/import_utils.py +./verl/utils/logging_utils.py +./verl/utils/megatron_utils.py +./verl/utils/memory_buffer.py +./verl/utils/model.py +./verl/utils/py_functional.py +./verl/utils/ray_utils.py +./verl/utils/seqlen_balancing.py +./verl/utils/tokenizer.py +./verl/utils/torch_dtypes.py +./verl/utils/torch_functional.py +./verl/utils/tracking.py +./verl/utils/ulysses.py +./verl/utils/dataset/__init__.py +./verl/utils/dataset/rl_dataset.py +./verl/utils/dataset/rm_dataset.py +./verl/utils/debug/__init__.py +./verl/utils/debug/performance.py +./verl/utils/debug/trajectory_tracker.py +./verl/utils/logger/__init__.py +./verl/utils/logger/aggregate_logger.py +./verl/utils/megatron/__init__.py +./verl/utils/megatron/memory.py +./verl/utils/megatron/optimizer.py +./verl/utils/megatron/optimizer_config.py +./verl/utils/megatron/pipeline_parallel.py +./verl/utils/megatron/sequence_parallel.py +./verl/utils/megatron/tensor_parallel.py +./verl/utils/rendezvous/__init__.py +./verl/utils/rendezvous/ray_backend.py +./verl/utils/reward_score/__init__.py +./verl/utils/reward_score/countdown.py +./verl/utils/reward_score/gsm8k.py +./verl/utils/reward_score/math.py +./verl/utils/reward_score/multiply.py +./verl/utils/reward_score/qa_em.py +./verl/utils/reward_score/qa_em_format.py +./verl/version/version +./verl/workers/__init__.py 
+./verl/workers/fsdp_workers.py +./verl/workers/megatron_workers.py +./verl/workers/actor/__init__.py +./verl/workers/actor/base.py +./verl/workers/actor/dp_actor.py +./verl/workers/actor/megatron_actor.py +./verl/workers/critic/__init__.py +./verl/workers/critic/base.py +./verl/workers/critic/dp_critic.py +./verl/workers/critic/megatron_critic.py +./verl/workers/reward_model/__init__.py +./verl/workers/reward_model/base.py +./verl/workers/reward_model/megatron/__init__.py +./verl/workers/reward_model/megatron/reward_model.py +./verl/workers/rollout/__init__.py +./verl/workers/rollout/base.py +./verl/workers/rollout/hf_rollout.py +./verl/workers/rollout/tokenizer.py +./verl/workers/rollout/naive/__init__.py +./verl/workers/rollout/naive/naive_rollout.py +./verl/workers/rollout/vllm_rollout/__init__.py +./verl/workers/rollout/vllm_rollout/vllm_rollout.py +./verl/workers/sharding_manager/__init__.py +./verl/workers/sharding_manager/base.py +./verl/workers/sharding_manager/fsdp_ulysses.py +./verl/workers/sharding_manager/fsdp_vllm.py +./verl/workers/sharding_manager/megatron_vllm.py +verl.egg-info/PKG-INFO +verl.egg-info/SOURCES.txt +verl.egg-info/dependency_links.txt +verl.egg-info/requires.txt +verl.egg-info/top_level.txt +verl/version/version \ No newline at end of file diff --git a/code/RL_model/verl/Search-R1/verl/single_controller/__init__.py b/code/RL_model/verl/Search-R1/verl/single_controller/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bd850b790c7ef7ea88515b58e629cad45c0c84e2 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/single_controller/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +version_folder = os.path.dirname(os.path.join(os.path.abspath(__file__))) + +with open(os.path.join(version_folder, 'version/version')) as f: + __version__ = f.read().strip() diff --git a/code/RL_model/verl/Search-R1/verl/trainer/__init__.py b/code/RL_model/verl/Search-R1/verl/trainer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/trainer/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/code/RL_model/verl/Search-R1/verl/trainer/main_eval.py b/code/RL_model/verl/Search-R1/verl/trainer/main_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..018bdd8fdbe01dddda5da009694246021320ab44 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/trainer/main_eval.py @@ -0,0 +1,69 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Offline evaluate the performance of a generated file using reward model and ground truth verifier. +The input is a parquet file that contains N generated sequences and (optional) the ground truth. + +""" + +import hydra +from verl.utils.fs import copy_local_path_from_hdfs +from verl.utils.reward_score import math, gsm8k +import pandas as pd +import numpy as np + + +def select_reward_fn(data_source): + if data_source == 'lighteval/MATH': + return math.compute_score + else: + raise NotImplementedError + + +@hydra.main(config_path='config', config_name='evaluation', version_base=None) +def main(config): + local_path = copy_local_path_from_hdfs(config.data.path) + dataset = pd.read_parquet(local_path) + prompts = dataset[config.data.prompt_key] + responses = dataset[config.data.response_key] + data_sources = dataset[config.data.data_source_key] + reward_model_data = dataset[config.data.reward_model_key] + + passes = 0 + + total = len(dataset) + + for i in range(total): + response_lst = responses[i] + data_source = data_sources[i] + # select reward score based on data_source + prompt = prompts[i] + reward_data = reward_model_data[i] + reward_fn = select_reward_fn(data_source) + ground_truth = reward_data['ground_truth'] + score_lst = [] + for r in response_lst: + score = reward_fn(r, ground_truth) + score_lst.append(score) + + max_score = np.max(score_lst) + + if max_score == 1: + passes += 1 + + print(f'pass@5: {passes / total}') + + +if __name__ == '__main__': + main() diff --git a/code/RL_model/verl/Search-R1/verl/utils/__init__.py b/code/RL_model/verl/Search-R1/verl/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e453070a16370cd7006e0a7700c8550a56f19051 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . 
import tokenizer +from .tokenizer import * + +__all__ = tokenizer.__all__ \ No newline at end of file diff --git a/code/RL_model/verl/Search-R1/verl/utils/config.py b/code/RL_model/verl/Search-R1/verl/utils/config.py new file mode 100644 index 0000000000000000000000000000000000000000..5c9298c42adf89467d047a3d0fdf8919bf772a5a --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/config.py @@ -0,0 +1,23 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict + +from omegaconf import DictConfig + + +def update_dict_with_config(dictionary: Dict, config: DictConfig): + for key in dictionary: + if hasattr(config, key): + dictionary[key] = getattr(config, key) diff --git a/code/RL_model/verl/Search-R1/verl/utils/distributed.py b/code/RL_model/verl/Search-R1/verl/utils/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..6fea5a29cd943ef91c8f27f44db2a69e40702cf7 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/distributed.py @@ -0,0 +1,28 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utilities for distributed training.""" +import os + + +def initialize_global_process_group(timeout_second=36000): + import torch.distributed + from datetime import timedelta + torch.distributed.init_process_group('nccl', timeout=timedelta(seconds=timeout_second)) + local_rank = int(os.environ["LOCAL_RANK"]) + rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + + if torch.distributed.is_initialized(): + torch.cuda.set_device(local_rank) + return local_rank, rank, world_size diff --git a/code/RL_model/verl/Search-R1/verl/utils/flops_counter.py b/code/RL_model/verl/Search-R1/verl/utils/flops_counter.py new file mode 100644 index 0000000000000000000000000000000000000000..3c5ac1a91160fc3265589fb6e93e93c8c1efb53e --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/flops_counter.py @@ -0,0 +1,123 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import PretrainedConfig, Qwen2Config, LlamaConfig + +VALID_CONFIG_TYPE = (Qwen2Config, LlamaConfig) + + +def get_device_flops(unit="T"): + + def unit_convert(number, level): + units = ["B", "K", "M", "G", "T", "P"] + if number <= 0: + return number + ptr = 0 + while ptr < len(units) and units[ptr] != level: + number /= 1000 + ptr += 1 + return number + + device_name = torch.cuda.get_device_name() + flops = float("inf") # INF flops for unknown GPU type + if "H100" in device_name or "H800" in device_name: + flops = 989e12 + elif "A100" in device_name or "A800" in device_name: + flops = 312e12 + elif "L40" in device_name: + flops = 181.05e12 + elif "L20" in device_name: + flops = 119.5e12 + elif "H20" in device_name: + flops = 148e12 + elif "910B" in device_name: + flops = 354e12 + flops_unit = unit_convert(flops, unit) + return flops_unit + + +class FlopsCounter: + """ + Used to estimate MFU (model FLOPs utilization) during the training loop. + + Example: + flops_counter = FlopsCounter(config) + flops_achieved, flops_promised = flops_counter.estimate_flops(tokens_list, delta_time) + + """ + + def __init__(self, config: PretrainedConfig): + if not isinstance(config, VALID_CONFIG_TYPE): + print(f"Only support config type of {VALID_CONFIG_TYPE}, but got {type(config)}. " + f"MFU will always be zero.") + + self.estimate_func = {"qwen2": self._estimate_qwen2_flops, 'llama': self._estimate_qwen2_flops} + self.config = config + + def _estimate_unknown_flops(self, tokens_sum, batch_seqlens, delta_time): + return 0 + + def _estimate_qwen2_flops(self, tokens_sum, batch_seqlens, delta_time): + assert isinstance(self.config, (Qwen2Config, LlamaConfig)) + hidden_size = self.config.hidden_size + vocab_size = self.config.vocab_size + num_hidden_layers = self.config.num_hidden_layers + num_key_value_heads = self.config.num_key_value_heads + num_attention_heads = self.config.num_attention_heads + intermediate_size = self.config.intermediate_size + + head_dim = hidden_size // num_attention_heads + q_size = num_attention_heads * head_dim + k_size = num_key_value_heads * head_dim + v_size = num_key_value_heads * head_dim + + # non-attention per-layer params + # Qwen2/Llama use SwiGLU: gate, up, and down linear layers in the MLP + mlp_N = hidden_size * intermediate_size * 3 + attn_linear_N = hidden_size * (q_size + k_size + v_size + num_attention_heads * head_dim) + emd_and_lm_head_N = vocab_size * hidden_size * 2 + # non-attention params over all layers + dense_N = (mlp_N + attn_linear_N) * num_hidden_layers + emd_and_lm_head_N + # non-attn all_layer & all_token fwd & bwd flops + dense_N_flops = 6 * dense_N * tokens_sum + + # attn all_layer & all_token fwd & bwd flops + seqlen_square_sum = 0 + for seqlen in batch_seqlens: + seqlen_square_sum += seqlen * seqlen + attn_qkv_flops = 12 * seqlen_square_sum * head_dim * num_attention_heads * num_hidden_layers + + # all_layer & all_token fwd & bwd flops + flops_all_token = dense_N_flops + attn_qkv_flops + flops_achieved = flops_all_token * (1.0 / delta_time) / 1e12 + return flops_achieved + + def estimate_flops(self, batch_seqlens, delta_time): + """ + Estimate the FLOPS based on the number of valid tokens in the current batch and the time taken. + + Args: + batch_seqlens (List[int]): A list where each element is the number of valid tokens in one sequence of the current batch. + delta_time (float): The time taken to process the batch, in seconds. + + Returns: + estimated_flops (float): The estimated FLOPS based on the input tokens and time. + promised_flops (float): The expected FLOPS of the current device. + """ + tokens_sum = sum(batch_seqlens) + func = self.estimate_func.get(self.config.model_type, self._estimate_unknown_flops) + estimated_flops = func(tokens_sum, batch_seqlens, delta_time) + promised_flops = get_device_flops() + return estimated_flops, promised_flops diff --git a/code/RL_model/verl/Search-R1/verl/utils/fs.py b/code/RL_model/verl/Search-R1/verl/utils/fs.py new file mode 100644 index 0000000000000000000000000000000000000000..80c1889be3582fffcdef5267f5e9ac55e1d7e059 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/fs.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# -*- coding: utf-8 -*- +"""File-system agnostic IO APIs""" +import os +import tempfile +import hashlib + +from .hdfs_io import copy, makedirs, exists + +__all__ = ["copy", "exists", "makedirs"] + +_HDFS_PREFIX = "hdfs://" + + +def _is_non_local(path): + return path.startswith(_HDFS_PREFIX) + + +def md5_encode(path: str) -> str: + return hashlib.md5(path.encode()).hexdigest() + + +def get_local_temp_path(hdfs_path: str, cache_dir: str) -> str: + """Return a local temp path that joins cache_dir and basename of hdfs_path + + Args: + hdfs_path: the remote (HDFS) file path + cache_dir: the local cache directory + + Returns: + the local destination path under cache_dir + """ + # hash hdfs_path with md5 to avoid directory conflicts + encoded_hdfs_path = md5_encode(hdfs_path) + temp_dir = os.path.join(cache_dir, encoded_hdfs_path) + os.makedirs(temp_dir, exist_ok=True) + dst = os.path.join(temp_dir, os.path.basename(hdfs_path)) + return dst + + +def copy_local_path_from_hdfs(src: str, cache_dir=None, filelock='.file.lock', verbose=False) -> str: + """Copy src from hdfs to local if src is on hdfs or directly return src. + If cache_dir is None, we will use the default cache dir of the system. Note that this may cause conflicts if + the src name is the same between calls. + + Args: + src (str): an HDFS path or a local path + + Returns: + a local path of the copied file + """ + from filelock import FileLock + + assert src[-1] != '/', f'Make sure the last char in src is not / because it will cause error. 
Got {src}' + + if _is_non_local(src): + # download from hdfs to local + if cache_dir is None: + # get a temp folder + cache_dir = tempfile.gettempdir() + os.makedirs(cache_dir, exist_ok=True) + assert os.path.exists(cache_dir) + local_path = get_local_temp_path(src, cache_dir) + # get a specific lock + filelock = md5_encode(src) + '.lock' + lock_file = os.path.join(cache_dir, filelock) + with FileLock(lock_file=lock_file): + if not os.path.exists(local_path): + if verbose: + print(f'Copy from {src} to {local_path}') + copy(src, local_path) + return local_path + else: + return src diff --git a/code/RL_model/verl/Search-R1/verl/utils/fsdp_utils.py b/code/RL_model/verl/Search-R1/verl/utils/fsdp_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d0243cd15c2d2defe8e54164c6e07a05c5f6232d --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/fsdp_utils.py @@ -0,0 +1,329 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict +import functools +import json +import math +import itertools +import os +from contextlib import contextmanager +from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, transformer_auto_wrap_policy +from transformers.trainer_pt_utils import get_module_class_from_name +import torch +import torch.nn as nn +import torch.distributed as dist + + +def init_fn(x: torch.nn.Module): + if not torch.distributed.get_rank() == 0: + x = x.to_empty(device=torch.cuda.current_device(), recurse=False) + torch.cuda.empty_cache() + return x + + +def get_init_weight_context_manager(use_meta_tensor=True): + from accelerate import init_empty_weights + cpu_init_weights = lambda: torch.device('cpu') + if use_meta_tensor: + init_context = init_empty_weights if torch.distributed.get_rank() != 0 else cpu_init_weights + else: + init_context = cpu_init_weights + return init_context + + +# Copyright 2020-present the HuggingFace Inc. team. +# Adapted from https://github.com/huggingface/transformers/src/transformers/trainer.py +def get_fsdp_wrap_policy(module, config=None, is_lora=False): + """Get FSDP wrap policy for the module. 
+ + Args: + module: The module to get wrap policy for + config: Configuration for wrap policy + is_lora: Whether to enable lambda policy for LoRA modules + """ + if config is None: + config = {} + + if config.get('disable', False): + return None + + default_transformer_cls_names_to_wrap = getattr(module, "_no_split_modules", None) + fsdp_transformer_layer_cls_to_wrap = config.get("transformer_layer_cls_to_wrap", + default_transformer_cls_names_to_wrap) + min_num_params = config.get('min_num_params', 0) + auto_wrap_policy = None + + policies = [] + + from torch.distributed.fsdp.wrap import _or_policy, lambda_auto_wrap_policy, transformer_auto_wrap_policy + + # Add lambda policy for LoRA modules if is_lora is True + if is_lora: + + def lambda_policy_fn(module): + if (len(list(module.named_children())) == 0 and getattr(module, "weight", None) is not None and + module.weight.requires_grad): + return True + return False + + lambda_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn) + policies.append(lambda_policy) + + if min_num_params > 0: + size_policy = functools.partial(size_based_auto_wrap_policy, min_num_params=min_num_params) + policies.append(size_policy) + elif fsdp_transformer_layer_cls_to_wrap is not None: + transformer_cls_to_wrap = set() + for layer_class in fsdp_transformer_layer_cls_to_wrap: + transformer_cls = get_module_class_from_name(module, layer_class) + if transformer_cls is None: + raise Exception("Could not find the transformer layer class to wrap in the model.") + else: + transformer_cls_to_wrap.add(transformer_cls) + + transformer_policy = functools.partial( + transformer_auto_wrap_policy, + transformer_layer_cls=transformer_cls_to_wrap, + ) + policies.append(transformer_policy) + + if len(policies) > 0: + auto_wrap_policy = functools.partial(_or_policy, policies=policies) + + return auto_wrap_policy + + +def offload_fsdp_grad(module): + for _, param in module.named_parameters(): + if param.grad is not None: + param.grad = param.grad.to("cpu", non_blocking=True) + torch.cuda.empty_cache() + + +def load_fsdp_grad(module, device_id): + for _, param in module.named_parameters(): + if param.grad is not None: + param.grad = param.grad.to(device_id, non_blocking=True) + torch.cuda.empty_cache() + + +def offload_fsdp_param_and_grad(module, offload_grad=False): + for _, param in module.named_parameters(): + if hasattr(param, "_local_shard"): + param._local_shard = param._local_shard.to("cpu", non_blocking=True) + param.data = param.data.to('cpu', non_blocking=True) + if offload_grad and param.grad is not None: + param.grad = param.grad.to("cpu", non_blocking=True) + torch.cuda.empty_cache() + + +def load_fsdp_param_and_grad(module, device_id, load_grad=False): + for _, param in module.named_parameters(): + if hasattr(param, "_local_shard"): + param._local_shard = param._local_shard.to(device_id, non_blocking=True) + param.data = param.data.to(device_id, non_blocking=True) + if load_grad and param.grad is not None: + param.grad = param.grad.to(device_id, non_blocking=True) + torch.cuda.empty_cache() + + +def offload_fsdp_optimizer(optimizer): + for param_group in optimizer.param_groups: + for param in param_group['params']: + state = optimizer.state[param] + for key, value in state.items(): + if isinstance(value, torch.Tensor): + state[key] = value.to("cpu", non_blocking=True) + torch.cuda.empty_cache() + + +def load_fsdp_optimizer(optimizer, device_id): + for param_group in optimizer.param_groups: + for param in param_group['params']: + state = 
optimizer.state[param] + for key, value in state.items(): + if isinstance(value, torch.Tensor): + state[key] = value.to(device_id, non_blocking=True) + torch.cuda.empty_cache() + + +@contextmanager +def meta_device_init(): + """ + Create model parameters with meta device. + + Note buffers in model will still be initialized in default device (e.g., CPU), + since the buffers can be non-persistent and filled with expected values that can + NOT be captured in meta device. + """ + device = torch.device("meta") + old_register_parameter = nn.Module.register_parameter + registered = set() + + def register_empty_parameter(module, name, param): + old_register_parameter(module, name, param) + # we will skip register shared parameters as it + # is already registered previously + if param is not None and param not in registered: + param_cls = type(module._parameters[name]) + kwargs = module._parameters[name].__dict__ + kwargs["requires_grad"] = param.requires_grad + module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs) + registered.add(module._parameters[name]) + + try: + nn.Module.register_parameter = register_empty_parameter + yield + finally: + registered.clear() + nn.Module.register_parameter = old_register_parameter + + +def parallel_load_safetensors(filepath): + """ + Parallel load safetensors from huggingface checkpoint + + Huggingface checkpoint contains: + + - config.json: a json file for model configuration + - model.safetensor.index.json: a json file for safetensors (parameters & buffers) index + - model-000x-of-ooxx.safetensors: a binary file for safetensors (parameters & buffers) chunks + + Or (when model is small), + + - model.safetensors: a binary file for all parameters and buffers + + Each rank will own a part of model chunks and load them directly into GPU memory. + """ + from safetensors.torch import load_file + + safetensors2param = {} + + index_file = os.path.join(filepath, "model.safetensors.index.json") + if os.path.exists(index_file): + index = json.load(open(index_file, "rb")) + for param_name, filename in index["weight_map"].items(): + safetensors2param.setdefault(filename, []).append(param_name) + else: + # in this case, the model is small and we can load it all at once + param_file = os.path.join(filepath, "model.safetensors") + assert os.path.exists(param_file), f"Cannot find {param_file}" + states = load_file(param_file) + for param_name in states: + safetensors2param.setdefault("model.safetensors", []).append(param_name) + del states + + total_files = len(safetensors2param) + ckpt_chunks = sorted(safetensors2param.keys()) + world_size = dist.get_world_size() + size = int(math.ceil(total_files / world_size)) + ckpt_chunks = [ckpt_chunks[rank * size:rank * size + size] for rank in range(world_size)] + + shard_states = {} + device = torch.cuda.current_device() + for rank, files in enumerate(ckpt_chunks): + if rank == dist.get_rank(): + for file in files: + file = os.path.join(filepath, file) + states = load_file(file, device=device) + # print(f"rank {rank} loading {file}...") + shard_states.update(states) + else: + for file in files: + for param_name in safetensors2param[file]: + shard_states[param_name] = rank + return shard_states + + +def parallel_init_module_fn(module: torch.nn.Module, shard_states: Dict[str, torch.nn.Parameter]): + """ + Generate a function to initialize sub-modules in the `module` with `shard_states` + from huggingface checkpoint. 
+ + Args: + module (torch.nn.Module): the global module to be initialized + shard_states (Dict[str, torch.nn.Parameter]): the shard states from huggingface checkpoint + + Returns: + init_fn (Callable): a function to initialize sub-modules in the `module` with `shard_states` + """ + + state2fqn = {} + for name, state in itertools.chain(module.named_parameters(remove_duplicate=False), + module.named_buffers(remove_duplicate=False)): + state2fqn.setdefault(state, []).append(name) + # remove standalone parameters and buffers + shared = {s for s, names in state2fqn.items() if len(names) > 1} + materialized_states = {} + + @torch.no_grad() + def create_and_sync_state(param_name, state, is_param): + assert param_name in shard_states, f"{param_name} not loaded" + device = torch.cuda.current_device() + if is_param: + param = torch.nn.Parameter(torch.empty_like(state.data, device=device), requires_grad=state.requires_grad) + else: # buffer + param = torch.empty_like(state.data, device=device) + loaded = shard_states[param_name] + if isinstance(loaded, (torch.nn.Parameter, torch.Tensor)): + # NOTE: loaded.dtype can be different with param.dtype + param.data.copy_(loaded.data) + dist.broadcast(param.data, src=dist.get_rank()) + else: + assert isinstance(loaded, int) # the rank that holds the state + dist.broadcast(param.data, src=loaded) + shard_states.pop(param_name) + del loaded + return param + + def init_fn(sub_mod: torch.nn.Module, recurse: bool = True): + param_and_buffers = tuple(sub_mod.named_parameters(recurse=False)) + tuple(sub_mod.named_buffers(recurse=False)) + # param_and_buffers = sorted(sub_mod.named_parameters(recurse=False), key=lambda x: x[0]) + for name, state in param_and_buffers: + if not state.is_meta: + continue + is_param = name in sub_mod._parameters + fqn = state2fqn[state].pop(0) + # non-persistent buffers will not be saved in state dict, we can safely skip it + if (not is_param) and fqn not in shard_states: + if state.is_meta: + raise RuntimeError( + f"find a non-persistent buffer ({fqn}) initiated with device meta. " + "Such buffer is not saved in checkpoint and user should guarantee to init in CPU / GPU device.") + continue + # for shared parameter, we get it from the first time it is created + if state in shared: + if state not in materialized_states: + materialized_states[state] = create_and_sync_state(fqn, state, is_param) + else: + if fqn in shard_states: + shard_states.pop(fqn) + materialize_state = materialized_states[state] + # for not shared parameter, we create it directly + else: + materialize_state = create_and_sync_state(fqn, state, is_param) + if is_param: + sub_mod._parameters[name] = materialize_state + else: + sub_mod._buffers[name] = materialize_state + if recurse: + for module in sub_mod.children(): + init_fn(module, recurse=True) + + # for debug + # if len(shard_states) == 0: print("clear") + return sub_mod + + return init_fn \ No newline at end of file diff --git a/code/RL_model/verl/Search-R1/verl/utils/hdfs_io.py b/code/RL_model/verl/Search-R1/verl/utils/hdfs_io.py new file mode 100644 index 0000000000000000000000000000000000000000..08c4ecb9a5956865ce35651d6eaaf6844ba87f41 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/hdfs_io.py @@ -0,0 +1,144 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import logging + +logger = logging.getLogger(__file__) +logger.setLevel(os.getenv('VERL_SFT_LOGGING_LEVEL', 'WARN')) + +_HDFS_PREFIX = "hdfs://" + +_HDFS_BIN_PATH = shutil.which('hdfs') + + +def exists(path: str, **kwargs) -> bool: + r"""Works like os.path.exists() but supports hdfs. + + Test whether a path exists. Returns False for broken symbolic links. + + Args: + path (str): path to test + + Returns: + bool: True if the path exists, False otherwise + """ + if _is_non_local(path): + return _exists(path, **kwargs) + return os.path.exists(path) + + +def _exists(file_path: str): + """HDFS-aware check of whether file_path exists.""" + if file_path.startswith("hdfs"): + return _run_cmd(_hdfs_cmd(f"-test -e {file_path}")) == 0 + return os.path.exists(file_path) + + +def makedirs(name, mode=0o777, exist_ok=False, **kwargs) -> None: + r"""Works like os.makedirs() but supports hdfs. + + Super-mkdir; create a leaf directory and all intermediate ones. Works like + mkdir, except that any intermediate path segment (not just the rightmost) + will be created if it does not exist. If the target directory already + exists, raise an OSError if exist_ok is False. Otherwise no exception is + raised. This is recursive. + + Args: + name (str): directory to create + mode (int): file mode bits + exist_ok (bool): if True, do not raise an exception if the directory already exists + kwargs: keyword arguments for hdfs + + """ + if _is_non_local(name): + # TODO(haibin.lin): + # - handle OSError for hdfs(?) + # - support exist_ok for hdfs(?) + _mkdir(name, **kwargs) + else: + os.makedirs(name, mode=mode, exist_ok=exist_ok) + + +def _mkdir(file_path: str) -> bool: + """hdfs mkdir""" + if file_path.startswith("hdfs"): + _run_cmd(_hdfs_cmd(f"-mkdir -p {file_path}")) + else: + os.makedirs(file_path, exist_ok=True) + return True + + +def copy(src: str, dst: str, **kwargs) -> bool: + r"""Works like shutil.copy() for file, and shutil.copytree for dir, and supports hdfs. + + Copy data and mode bits ("cp src dst"). Return the file's destination. + The destination may be a directory. + If source and destination are the same file, a SameFileError will be + raised. + + Args: + src (str): source file path + dst (str): destination file path + kwargs: keyword arguments for hdfs copy + + Returns: + str or bool: the destination path for a local copy, or a success flag for an HDFS copy + + """ + if _is_non_local(src) or _is_non_local(dst): + # TODO(haibin.lin): + # - handle SameFileError for hdfs files(?) 
+ # - return file destination for hdfs files + return _copy(src, dst) + else: + if os.path.isdir(src): + return shutil.copytree(src, dst, **kwargs) + else: + return shutil.copy(src, dst, **kwargs) + + +def _copy(from_path: str, to_path: str, timeout: int = None) -> bool: + if to_path.startswith("hdfs"): + if from_path.startswith("hdfs"): + returncode = _run_cmd(_hdfs_cmd(f"-cp -f {from_path} {to_path}"), timeout=timeout) + else: + returncode = _run_cmd(_hdfs_cmd(f"-put -f {from_path} {to_path}"), timeout=timeout) + else: + if from_path.startswith("hdfs"): + returncode = _run_cmd(_hdfs_cmd(f"-get \ + {from_path} {to_path}"), timeout=timeout) + else: + try: + shutil.copy(from_path, to_path) + returncode = 0 + except shutil.SameFileError: + returncode = 0 + except Exception as e: + logger.warning(f"copy {from_path} {to_path} failed: {e}") + returncode = -1 + return returncode == 0 + + +def _run_cmd(cmd: str, timeout=None): + return os.system(cmd) + + +def _hdfs_cmd(cmd: str) -> str: + return f"{_HDFS_BIN_PATH} dfs {cmd}" + + +def _is_non_local(path: str): + return path.startswith(_HDFS_PREFIX) diff --git a/code/RL_model/verl/Search-R1/verl/utils/import_utils.py b/code/RL_model/verl/Search-R1/verl/utils/import_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e5690512d144a30d2a1f0bd128a40eb8876936b7 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/import_utils.py @@ -0,0 +1,48 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utilities to check if packages are available. +We assume package availability won't change during runtime. +""" + +from functools import cache +from typing import List + + +@cache +def is_megatron_core_available(): + try: + from megatron.core import parallel_state as mpu + return True + except ImportError: + return False + + +@cache +def is_vllm_available(): + try: + import vllm + return True + except ImportError: + return False + + +def import_external_libs(external_libs=None): + if external_libs is None: + return + if not isinstance(external_libs, List): + external_libs = [external_libs] + import importlib + for external_lib in external_libs: + importlib.import_module(external_lib) diff --git a/code/RL_model/verl/Search-R1/verl/utils/logging_utils.py b/code/RL_model/verl/Search-R1/verl/utils/logging_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1bf6e1f0fa70784edb6a7e6efecdba07f0c399b3 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/logging_utils.py @@ -0,0 +1,22 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + + +def set_basic_config(level): + """ + This function sets the global logging format and level. It will be called when import verl + """ + logging.basicConfig(format='%(levelname)s:%(asctime)s:%(message)s', level=level) diff --git a/code/RL_model/verl/Search-R1/verl/utils/megatron_utils.py b/code/RL_model/verl/Search-R1/verl/utils/megatron_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fcb6b65a79ea302e3f7eaccd5145e29adbb9edd6 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/megatron_utils.py @@ -0,0 +1,253 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Pretrain utilities.""" +from typing import Any, Dict +import time +from omegaconf import DictConfig +from verl.utils.torch_dtypes import PrecisionType +from verl.utils.memory_buffer import build_memory_reference_from_module +import torch +import torch.nn as nn +import torch.nn.functional as F + +from megatron.core import mpu, tensor_parallel +from megatron.core.utils import get_model_config +from megatron.core.transformer import TransformerConfig +from megatron.core.transformer.module import Float16Module +# from megatron.core.distributed import DistributedDataParallelConfig +from megatron.core.distributed import DistributedDataParallel as DDP +from megatron.core.enums import ModelType + + +def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True): + """Build the model.""" + # Build model. + if mpu.get_pipeline_model_parallel_world_size() > 1 and \ + mpu.get_virtual_pipeline_model_parallel_world_size() is not None: + assert model_type != ModelType.encoder_and_decoder, \ + "Interleaved schedule not supported for model with both encoder and decoder" + model = [] + for i in range(mpu.get_virtual_pipeline_model_parallel_world_size()): + mpu.set_virtual_pipeline_model_parallel_rank(i) + # Set pre_process and post_process only after virtual rank is set. 
+ pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + this_model = model_provider_func(pre_process=pre_process, post_process=post_process) + this_model.model_type = model_type + model.append(this_model) + else: + pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + add_encoder = True + add_decoder = True + if model_type == ModelType.encoder_and_decoder: + if mpu.get_pipeline_model_parallel_world_size() > 1: + assert mpu.get_pipeline_model_parallel_split_rank() is not None, \ + "Split rank needs to be specified for model with both encoder and decoder" + rank = mpu.get_pipeline_model_parallel_rank() + split_rank = mpu.get_pipeline_model_parallel_split_rank() + world_size = mpu.get_pipeline_model_parallel_world_size() + pre_process = rank == 0 or rank == split_rank + post_process = (rank == (split_rank - 1)) or (rank == (world_size - 1)) + add_encoder = mpu.is_pipeline_stage_before_split() + add_decoder = mpu.is_pipeline_stage_after_split() + model = model_provider_func(pre_process=pre_process, + post_process=post_process, + add_encoder=add_encoder, + add_decoder=add_decoder) + else: + model = model_provider_func(pre_process=pre_process, post_process=post_process) + model.model_type = model_type + + if not isinstance(model, list): + model = [model] + + # Set tensor model parallel attributes if not set. + # Only parameters that are already tensor model parallel have these + # attributes set for them. We should make sure the default attributes + # are set for all params so the optimizer can use them. + for model_module in model: + for param in model_module.parameters(): + tensor_parallel.set_defaults_if_not_set_tensor_model_parallel_attributes(param) + + # Print number of parameters. + if mpu.get_data_parallel_rank() == 0: + print(' > number of parameters on (tensor, pipeline) ' + 'model parallel rank ({}, {}): {}'.format( + mpu.get_tensor_model_parallel_rank(), mpu.get_pipeline_model_parallel_rank(), + sum([sum([p.nelement() for p in model_module.parameters()]) for model_module in model])), + flush=True) + + # GPU allocation. + for model_module in model: + model_module.cuda(torch.cuda.current_device()) + + # Fp16 conversion. + config = get_model_config(model[0]) + if config.fp16 or config.bf16: # the ModelParallelConfig in GPTModel + model = [Float16Module(config, model_module) for model_module in model] + + if wrap_with_ddp: + model = [ + DDP(config=config, + module=model_chunk, + data_parallel_group=mpu.get_data_parallel_group(with_context_parallel=True), + accumulate_allreduce_grads_in_fp32=True, + overlap_grad_reduce=False, + use_distributed_optimizer=True, + disable_bucketing=(model_chunk_idx > 0)) for (model_chunk_idx, model_chunk) in enumerate(model) + ] + # # Broadcast params from data parallel src rank to other data parallel ranks. 
+ # if args.data_parallel_random_init: + for model_module in model: + model_module.broadcast_params() + return model + + +ALL_MODULE_WRAPPER_CLASSNAMES = (DDP, Float16Module) + + +def unwrap_model(model, module_instances=ALL_MODULE_WRAPPER_CLASSNAMES): + return_list = True + if not isinstance(model, list): + model = [model] + return_list = False + unwrapped_model = [] + for model_module in model: + while isinstance(model_module, module_instances): + model_module = model_module.module + unwrapped_model.append(model_module) + if not return_list: + return unwrapped_model[0] + return unwrapped_model + + +from transformers import PretrainedConfig + + +def convert_config(hf_config: PretrainedConfig, megatron_config) -> TransformerConfig: + print(f'megatron config {megatron_config}') + dt = PrecisionType.to_dtype(megatron_config['param_dtype']) + print(f'pipeline_dtype=megatron_config {dt}') + transformer_config = TransformerConfig( + num_layers=hf_config.num_hidden_layers, + hidden_size=hf_config.hidden_size, + num_attention_heads=hf_config.num_attention_heads, + num_query_groups=hf_config.num_key_value_heads, + ffn_hidden_size=hf_config.intermediate_size, + # max_position_embeddings=hf_config.max_position_embeddings, + activation_func=F.silu, + normalization='RMSNorm', + # rotary_percent=False, # default, + gated_linear_unit=True, # for llama + use_cpu_initialization=True, + apply_residual_connection_post_layernorm=False, # check what's this mean + add_bias_linear=False, + tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(), + pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(), + virtual_pipeline_model_parallel_size=mpu.get_virtual_pipeline_model_parallel_world_size(), + pipeline_dtype=PrecisionType.to_dtype(megatron_config['param_dtype']), + params_dtype=PrecisionType.to_dtype(megatron_config['param_dtype']), + sequence_parallel=megatron_config['sequence_parallel_enabled'], + variable_seq_lengths=True, + masked_softmax_fusion=True, + bf16=PrecisionType.to_dtype(megatron_config['param_dtype']) is torch.bfloat16) + if torch.distributed.get_rank() == 0: + print(f'tensor_parallel_size={transformer_config.tensor_model_parallel_size} \n \ + pipeline_model_parallel_size={transformer_config.pipeline_model_parallel_size} \n \ + virtual_pipeline_model_parallel_size={transformer_config.virtual_pipeline_model_parallel_size} \n \ + pipeline_dtype={transformer_config.pipeline_dtype} \n \ + params_dtype={transformer_config.params_dtype} \n \ + sequence_parallel={transformer_config.sequence_parallel} \n \ + variable_seq_lengths={transformer_config.variable_seq_lengths} \n \ + masked_softmax_fusion={transformer_config.masked_softmax_fusion} \n ') + + return transformer_config + + +# from megatron.core.optimizer import OptimizerConfig + +from verl.utils.megatron.optimizer_config import OptimizerConfig + + +def init_megatron_optim_config(optim_config: Dict) -> OptimizerConfig: + config = OptimizerConfig( + optimizer='adam', + lr=optim_config.get('lr'), + clip_grad=optim_config.get('clip_grad'), + weight_decay=1e-2, + bf16=True, + params_dtype=torch.bfloat16, + use_distributed_optimizer=True, + ) + return config + + +from megatron.core import ModelParallelConfig + + +def init_model_parallel_config(config: DictConfig) -> ModelParallelConfig: + # TODO(sgm): check how to disable megatron timers + timers = FakeTimers() + return ModelParallelConfig(tensor_model_parallel_size=config.get('tensor_model_parallel_size'), + 
pipeline_model_parallel_size=config.get('pipeline_model_parallel_size'), + virtual_pipeline_model_parallel_size=config.get('virtual_pipeline_model_parallel_size'), + sequence_parallel=config.get('sequence_parallel'), + params_dtype=PrecisionType.to_dtype(config.get('param_dtype')), + pipeline_dtype=PrecisionType.to_dtype(config.get('param_dtype')), + bf16=True, + fp16=False, + timers=timers) + + +class FakeTimers: + """Disable All Megatron Timing with FakeTimers""" + + def __init__(self): + from megatron.timers import DummyTimer + self.dummy_timer = DummyTimer() + + def __call__(self, *args: Any, **kwds: Any) -> Any: + return self.dummy_timer + + +def offload_megatron_param_and_grad(module_list: nn.ModuleList, offload_grad=False, hybrid_engine=None): + if hybrid_engine is not None: + pp_rank = mpu.get_pipeline_model_parallel_rank() + for buffer in hybrid_engine.memory_buffers[pp_rank].values(): + buffer.data = buffer.data.to('cpu', non_blocking=True) + build_memory_reference_from_module(module_list, hybrid_engine.memory_buffers[pp_rank], maintain_weight=True) + else: + for module in module_list: + for _, param in module.named_parameters(): + param.data = param.data.to('cpu', non_blocking=True) + if offload_grad and param.grad is not None: + param.grad = param.grad.to("cpu", non_blocking=True) + torch.cuda.empty_cache() + + +def load_megatron_param_and_grad(module_list: nn.ModuleList, device_id, load_grad=False, hybrid_engine=None): + if hybrid_engine is not None: + pp_rank = mpu.get_pipeline_model_parallel_rank() + for buffer in hybrid_engine.memory_buffers[pp_rank].values(): + buffer.data = buffer.data.to(device_id, non_blocking=True) + build_memory_reference_from_module(module_list, hybrid_engine.memory_buffers[pp_rank], maintain_weight=True) + else: + for module in module_list: + for _, param in module.named_parameters(): + param.data = param.data.to(device_id, non_blocking=True) + if load_grad and param.grad is not None: + param.grad = param.grad.to(device_id, non_blocking=True) + torch.cuda.empty_cache() diff --git a/code/RL_model/verl/Search-R1/verl/utils/memory_buffer.py b/code/RL_model/verl/Search-R1/verl/utils/memory_buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..2e07e42f7bc4648d3376dba404ae122e07ccb0d0 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/memory_buffer.py @@ -0,0 +1,214 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This file contains utilities to manipulate torch memory buffers +""" + +from typing import Dict, List + +import torch +from torch import nn + + +class MemoryBuffer: + """ + A memory buffer is a contiguous torch tensor that may combine multiple tensors sharing with the underlying + memory. It must have a unique type to support this behavior. 
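+
+    Example (illustrative): buf = MemoryBuffer(16, 16, torch.float32) can hold a
+    (2, 3) tensor via buf.get(torch.Size([2, 3]), start_index=0) and a (4,) tensor
+    via buf.get(torch.Size([4]), start_index=8); both results are views of buf.data,
+    so writes through them update the shared storage in place.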
+ """ + + def __init__(self, numel: int, numel_padded: int, dtype: torch.dtype): + self.numel = numel + self.numel_padded = numel_padded + self.dtype = dtype + self.data = torch.zeros(self.numel_padded, dtype=self.dtype, device='cuda', requires_grad=False) + + def zero(self): + """Reset the buffer to zero.""" + self.data.zero_() + + def get(self, shape, start_index): + """Return a tensor with the input `shape` as a view into the + 1-D data starting at `start_index`.""" + end_index = start_index + shape.numel() + assert end_index <= self.numel, \ + 'requested tensor is out of the buffer range.' + buffer_tensor = self.data[start_index:end_index] + buffer_tensor = buffer_tensor.view(shape) + return buffer_tensor + + +def calc_padded_numel(shape: torch.Size, dtype: torch.dtype): + """for cuda memory alignment, make sure alignment by 128-bits""" + align_numel = 128 // torch.finfo(dtype).bits + numel = shape.numel() + return (numel + align_numel - 1) // align_numel * align_numel + + +def get_weight_buffer_meta_from_module(module: nn.Module) -> Dict[str, Dict]: + """ + Return a dictionary containing name to a shape and dtype. + """ + weight_buffer_meta = {} + for name, param in sorted(module.named_parameters()): + weight_buffer_meta[name] = {'shape': param.shape, 'dtype': param.dtype} + return weight_buffer_meta + + +def build_memory_buffer(weight_buffer_meta: Dict[str, Dict]) -> Dict[torch.dtype, MemoryBuffer]: + """Build the memory buffer given weight_buffer_meta + + Args: + weight_buffer_meta: contains mapping from name to a dictionary containing shape and dtype of the tensors + + Returns: a large memory buffer for each dtype that can hold all the tensors + + """ + memory_buffers = {} + total_numel_map = {} # map from dtype to the total numel + for name, meta_info in sorted(weight_buffer_meta.items()): + shape = meta_info['shape'] + dtype = meta_info['dtype'] + + assert isinstance(shape, torch.Size) + assert isinstance(dtype, torch.dtype) + + if dtype not in total_numel_map: + total_numel_map[dtype] = 0 + + total_numel_map[dtype] += calc_padded_numel(shape, dtype) + + for dtype, total_numel in total_numel_map.items(): + memory_buffers[dtype] = MemoryBuffer(total_numel, total_numel, dtype) + + return memory_buffers + + +def build_memory_reference_from_module(module: torch.nn.Module, + memory_buffers: Dict[torch.dtype, MemoryBuffer], + maintain_weight=True): + start_index = {} + for dtype in memory_buffers.keys(): + start_index[dtype] = 0 + for name, param in sorted(module.named_parameters()): + memory_buffer = memory_buffers[param.dtype] + buffer = memory_buffer.get(shape=param.shape, start_index=start_index[param.dtype]) + # need to increment start_index + start_index[param.dtype] += calc_padded_numel(param.shape, dtype) + if maintain_weight: + buffer.copy_(param.data) + param.data = buffer + + +def build_memory_reference(weight_buffer_meta: Dict[str, Dict], memory_buffers: Dict[torch.dtype, MemoryBuffer]): + """Build the memory references. The memory buffers are built using the build_memory_buffer API. + This API will allocate a weight buffer pointer to the memory buffer according to the weight_buffer_meta. 
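+    Each returned entry is a view into the per-dtype MemoryBuffer; tensors of the
+    same dtype are laid out back to back at 128-bit-aligned offsets (see
+    calc_padded_numel), so no additional GPU memory is allocated for the weights.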
+ + Args: + weight_buffer_meta: + memory_buffers: + + Returns: + + """ + start_idx = {} + weight_buffers = {} + for dtype in memory_buffers.keys(): + start_idx[dtype] = 0 + + for name, meta_info in sorted(weight_buffer_meta.items()): + shape = meta_info['shape'] + dtype = meta_info['dtype'] + + buffer = memory_buffers[dtype].get(shape, start_index=start_idx[dtype]) + start_idx[dtype] += calc_padded_numel(shape, dtype) + weight_buffers[name] = buffer + + return weight_buffers + + +class MemoryBufferModuleWrapper: + """ + Note that we do not design MemoryBufferModuleWrapper as an nn.Module due to + - It will change the checkpoint name + """ + + def __init__(self, module: nn.Module): + super().__init__() + self.module = module + self.weight_buffer_meta = get_weight_buffer_meta_from_module(self.module) + self.memory_buffers = build_memory_buffer(self.weight_buffer_meta) + build_memory_reference_from_module(self.module, self.memory_buffers) + + def get_memory_buffers(self): + return self.memory_buffers + + def get_weight_buffer_meta(self): + return self.weight_buffer_meta + + +class MegatronMemoryBufferForRollout(object): + """ + We assume that + - inference engine has tp + dp + - actor has tp + pp + dp + - the tp between inference engine and actor should be the same + - memory_buffers: contains a list of memory_buffers, each is a dict from dtype to MemoryBuffer + - weight_buffers: contains a list of weight_buffers, each is a dict from name to param + - named_parameters: a dict from name to parameter that normalizes the names from pp and vpp. Note that + the named_parameters may not be directly compatible with inference engine. User has to take care of + this part such as the layout mismatches. (e.g. qkv transpose) + - Note that weight_buffer, named_parameters and memory_buffers share the same underlying GPU memory. + - When doing weight sync, the data is transfer via memory buffers + """ + + def __init__(self, transform_memory_param_fn): + self._memory_buffers = [] + self._weight_buffers = [] + self._named_parameters = {} + self.transform_memory_param_fn = transform_memory_param_fn + + def initialize_weight_buffer(self, weight_buffer_meta_pp: List[Dict[str, Dict]]): + """ + Initialize the weight buffer. The weight buffer is obtained according to the actor. We will construct + a large buffer for each dtype in the weight_buffer. 
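+        One MemoryBuffer per dtype is allocated for every pipeline stage here; the
+        per-parameter views are only materialized later by build_memory_reference.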
+ + Args: + weight_buffer_meta: contains pp models, each pp models contains a dictionary of mapping from + + Returns: None + + """ + self.weight_buffer_meta_pp = weight_buffer_meta_pp + + for weight_buffer_meta in self.weight_buffer_meta_pp: + memory_buffer = build_memory_buffer(weight_buffer_meta) + self._memory_buffers.append(memory_buffer) + self._weight_buffers.append(None) + + def build_memory_reference(self): + for i, weight_buffer_meta in enumerate(self.weight_buffer_meta_pp): + self._weight_buffers[i] = build_memory_reference(weight_buffer_meta, self._memory_buffers[i]) + self._named_parameters = self.transform_memory_param_fn(self._weight_buffers) + + @property + def named_parameters(self): + return self._named_parameters + + @property + def weight_buffers(self): + return self._weight_buffers + + @property + def memory_buffers(self): + return self._memory_buffers diff --git a/code/RL_model/verl/Search-R1/verl/utils/model.py b/code/RL_model/verl/Search-R1/verl/utils/model.py new file mode 100644 index 0000000000000000000000000000000000000000..9002451a1dce34b8c844f907ee6ac487351b5314 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/model.py @@ -0,0 +1,332 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Utilities to create common models from huggingface +""" +import os +import warnings +from typing import Dict, Type + +import numpy as np +import torch +from torch import nn +from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, MistralForSequenceClassification +from verl.models.registry import ModelRegistry + + +class LambdaLayer(nn.Module): + + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, *args, **kwargs): + return self.fn(*args, **kwargs) + + +def squeeze(x): + return torch.squeeze(x, dim=-1) + + +def update_model_config(module_config, override_config_kwargs): + for key, val in override_config_kwargs.items(): + setattr(module_config, key, val) + + +def get_huggingface_actor_config(model_name: str, override_config_kwargs=None, trust_remote_code=False) -> Dict: + if override_config_kwargs is None: + override_config_kwargs = {} + assert isinstance(override_config_kwargs, Dict), \ + f'override_config_kwargs must be a dict, got {type(override_config_kwargs)}' + module_config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code) + update_model_config(module_config, override_config_kwargs) + + return module_config + + +def create_huggingface_actor(model_name: str, override_config_kwargs=None, automodel_kwargs=None) -> nn.Module: + """ + + Args: + model_name: + actor_override_config_kwargs: + + Returns: + + """ + if override_config_kwargs is None: + override_config_kwargs = {} + if automodel_kwargs is None: + automodel_kwargs = {} + assert isinstance(override_config_kwargs, Dict), \ + f'override_config_kwargs must be a dict, got {type(override_config_kwargs)}' + module_config = get_huggingface_actor_config(model_name, + override_config_kwargs, + trust_remote_code=automodel_kwargs.get('trust_remote_code', False)) + module: nn.Module = AutoModelForCausalLM.from_config(module_config, **automodel_kwargs) + return module + + +def create_huggingface_critic(model_name: str, override_config_kwargs=None, automodel_kwargs=None) -> nn.Module: + """ + + Args: + model_name: + override_config_kwargs: + + Returns: + + """ + critic_module: nn.Module = create_huggingface_actor(model_name, + override_config_kwargs=override_config_kwargs, + automodel_kwargs=automodel_kwargs) + if automodel_kwargs is None: + automodel_kwargs = {} + torch_dtype = automodel_kwargs.get('torch_dtype', torch.float32) + critic_module.lm_head = nn.Sequential(nn.Linear(critic_module.config.hidden_size, 1, dtype=torch_dtype), + LambdaLayer(fn=squeeze)) + return critic_module + + +def get_model_size(model: nn.Module, scale='auto'): + n_params = sum(p.numel() for p in model.parameters()) + + if scale == 'auto': + if n_params > 1e9: + scale = 'B' + elif n_params > 1e6: + scale = 'M' + elif n_params > 1e3: + scale = 'K' + else: + scale = '' + + if scale == 'B': + n_params = n_params / 1e9 + elif scale == 'M': + n_params = n_params / 1e6 + elif scale == 'K': + n_params = n_params / 1e3 + elif scale == '': + pass + else: + raise NotImplemented(f'Unknown scale {scale}') + + return n_params, scale + + +def print_model_size(model: nn.Module, name: str = None): + n_params, scale = get_model_size(model, scale='auto') + if name is None: + name = model.__class__.__name__ + print(f'{name} contains {n_params:.2f}{scale} parameters') + + +def create_random_mask(input_ids: torch.Tensor, + max_ratio_of_valid_token: float, + max_ratio_of_left_padding: float, + min_ratio_of_valid_token: float = 0): + """Create a random mask given input_ids. 
Support left padding and right padding. + Process: + - Sample valid token length + - Sample left_padding length + - Generate padding + + Args: + input_ids: + shape (batch_size, seq_len) + + Returns: + + """ + assert max_ratio_of_valid_token > 0 and max_ratio_of_valid_token <= 1. + assert max_ratio_of_left_padding >= 0 and max_ratio_of_left_padding < 1. + assert min_ratio_of_valid_token <= max_ratio_of_valid_token + + batch_size, sequence_length = input_ids.shape + max_num_valid_tokens = int(sequence_length * max_ratio_of_valid_token) + min_num_valid_tokens = max(1, int(sequence_length * min_ratio_of_valid_token)) + max_left_padding = int(sequence_length * max_ratio_of_left_padding) + assert max_num_valid_tokens + max_left_padding <= sequence_length + assert max_num_valid_tokens > 0 and max_ratio_of_valid_token <= sequence_length + masks = torch.ones_like(input_ids, dtype=torch.int64) + # TODO: we can make this faster + for i in range(batch_size): + num_left_padding = np.random.randint(low=0, high=max_left_padding + 1, dtype=np.int64) + num_valid = np.random.randint(low=min_num_valid_tokens, high=max_num_valid_tokens + 1, dtype=np.int64) + + for index in range(num_left_padding): + masks[i, index] = 0 + + for index in range(num_left_padding + num_valid, sequence_length): + masks[i, index] = 0 + return masks + + +def compute_position_id_with_mask(mask): + return torch.clip(torch.cumsum(mask, dim=-1) - 1, min=0, max=None) + + +def normalize_pp_vpp_params(params, num_hidden_layers, layer_name='layers'): + """ + Normalize the pp vpp params into a complete named parameters. + This is useful when gather parameters from pp ranks and passed to a model without pp + + params: List[List[Dict[str, param]]] + params contains a list of pp, with a list of vpp named_parameters in each vpp chunk. 
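+    For example (illustrative), with pp_size=2, vpp_size=2 and num_hidden_layers=8,
+    the chunk at (pp_rank=1, vpp_rank=0) holds local layers 0-1, which are renamed
+    to global layers 2-3 in the returned dict.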
+ output: Dict[str, param] + + """ + + def normalize_model_name(name, pp_rank, vpp_rank, pp_size, vpp_size, num_layers): + """ + Transform the model name in each model_chunk in each pp stage into the name in inference engine + """ + if vpp_size > 1: + # print(f'try to bind vpp params to inference engine...') + layers_per_pp = num_layers // pp_size + layers_per_vpp = layers_per_pp // vpp_size + pp_offset = layers_per_vpp * pp_rank + vpp_offset = (layers_per_vpp * pp_size) * vpp_rank + layer_offset = pp_offset + vpp_offset + else: + layers_per_pp = num_layers // pp_size + layer_offset = layers_per_pp * pp_rank + + if layer_name in name: # belong to an intermediate layer + split_name = name.split('.') + # find the num next to split_name + for i, name in enumerate(split_name): + if name == layer_name: + break + layer_num_idx = i + 1 + # check the name + assert len(split_name) >= layer_num_idx + 1, f'split_name = {split_name}' + assert split_name[layer_num_idx].isdigit(), f'split_name = {split_name}' + # increment layer_num_idx by layer_offset + split_name[layer_num_idx] = str(int(split_name[layer_num_idx]) + layer_offset) + name = '.'.join(split_name) # weight name in inference_tp_model + return name + + pp_size = len(params) + normalized_name_to_param = {} + for pp_rank in range(len(params)): + vpp_size = len(params[pp_rank]) + for vpp_rank in range(vpp_size): + for name, param in params[pp_rank][vpp_rank].items(): + normalized_name = normalize_model_name(name, pp_rank, vpp_rank, pp_size, vpp_size, num_hidden_layers) + normalized_name_to_param[normalized_name] = param + + return normalized_name_to_param + + +def get_parallel_model_from_config(config, megatron_config, pre_process=None, post_process=None, value=False): + from megatron.core import ModelParallelConfig + assert isinstance(megatron_config, ModelParallelConfig) + model_class = _get_parallel_model_architecture_from_config(config, value) + + model = model_class(config, megatron_config, pre_process=pre_process, post_process=post_process) + return model + + +def _get_parallel_model_architecture_from_config(config: PretrainedConfig, value=False) -> Type[nn.Module]: + architectures = getattr(config, "architectures", []) + for arch in architectures: + model_cls = ModelRegistry.load_model_cls(arch, value) + if model_cls is not None: + return model_cls + raise ValueError(f"Model architectures {architectures} are not supported for now. " + f"Supported architectures: {ModelRegistry.get_supported_archs()}") + + +def load_megatron_model_weights(config, + model_config, + parallel_model, + params_dtype, + is_value_model=False, + local_cache_path='~/.cache/verl/rlhf'): + assert hasattr(model_config, "architectures"), "architectures cannot be empty when load weight!" 
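+    # the architecture names recorded in the HF config (e.g. "LlamaForCausalLM")
+    # select which Megatron weight loader is applied further below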
+ architectures = getattr(model_config, "architectures", []) + local_cache_path = os.path.expanduser(local_cache_path) + + if config.model.path.startswith("hdfs:"): + from verl.utils.fs import copy_local_path_from_hdfs + print(f'start download from {config.model.path}') + local_model_path = copy_local_path_from_hdfs(src=config.model.path, cache_dir=local_cache_path) + print('finish download') + else: + print(f"load from local dir {config.model.path}") + local_model_path = config.model.path + + # TODO: to find a better way to load mistral7b-rm lm_head + if 'mistral7b-rm' in config.model.path: + model = MistralForSequenceClassification.from_pretrained(local_model_path) # use score head instead of lm_head + state_dict = model.state_dict() + state_dict['lm_head.weight'] = state_dict['score.weight'] + state_dict['model.embed_tokens.weight'] = state_dict[ + 'model.embed_tokens.weight'][:32000] # workaround, 32001 -> 32000 + is_value_model = True + else: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + model = AutoModelForCausalLM.from_pretrained(local_model_path) + state_dict = model.state_dict() + + from verl.models.weight_loader_registry import get_weight_loader + print(f'before weight loader: architectures = {architectures}...') + for arch in architectures: + print(f'call weight loader arch = {arch}, model config = {model.config}') + weight_loader = get_weight_loader(arch) + weight_loader(state_dict=state_dict, + wrapped_models=parallel_model, + config=model.config, + params_dtype=params_dtype, + is_value_model=is_value_model) + + +# pad input_ids_rmpad, cu_seqlens and max_seqlen_in_batch to be divisible by tp +def pad_packed_inputs(unpad_tokens: torch.Tensor, cu_seqlens, max_seqlen_in_batch, size): + """pad the tokens such that the total length is a multiple of size. + This function is useful when applying sequence parallel and context parallel + + Args: + unpad_tokens: (total_nnz, ...). Tokens after removing padding + cu_seqlens: (total_nnz + 1,) + max_seqlen_in_batch: int + + Returns: + + """ + F = nn.functional + + total_nnz = unpad_tokens.shape[0] + + if total_nnz % size == 0: + pad_size = 0 + else: + pad_size = size - total_nnz % size + + # we assume adding a new data in the batch with seqlen pad_size + if pad_size > 0: + if unpad_tokens.ndim == 1: + unpad_tokens = F.pad(unpad_tokens, (0, pad_size)) + elif unpad_tokens.ndim == 2: + unpad_tokens = F.pad(unpad_tokens, (0, 0, 0, pad_size)) + else: + raise NotImplementedError(f'Padding dim {unpad_tokens.ndim()} is not supported') + + cu_seqlens = F.pad(cu_seqlens, (0, 1), value=pad_size + cu_seqlens[-1]) + max_seqlen_in_batch = max(max_seqlen_in_batch, pad_size) + + return unpad_tokens, cu_seqlens, max_seqlen_in_batch diff --git a/code/RL_model/verl/Search-R1/verl/utils/py_functional.py b/code/RL_model/verl/Search-R1/verl/utils/py_functional.py new file mode 100644 index 0000000000000000000000000000000000000000..8f5a0e176779cc19d3035a3af77a1bdf1f39349a --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/py_functional.py @@ -0,0 +1,56 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Contain small python utility functions +""" + +from typing import Dict +from types import SimpleNamespace + + +def union_two_dict(dict1: Dict, dict2: Dict): + """Union two dict. Will throw an error if there is an item not the same object with the same key. + + Args: + dict1: + dict2: + + Returns: + + """ + for key, val in dict2.items(): + if key in dict1: + assert dict2[key] == dict1[key], \ + f'{key} in meta_dict1 and meta_dict2 are not the same object' + dict1[key] = val + + return dict1 + + +def append_to_dict(data: Dict, new_data: Dict): + for key, val in new_data.items(): + if key not in data: + data[key] = [] + data[key].append(val) + + +class NestedNamespace(SimpleNamespace): + + def __init__(self, dictionary, **kwargs): + super().__init__(**kwargs) + for key, value in dictionary.items(): + if isinstance(value, dict): + self.__setattr__(key, NestedNamespace(value)) + else: + self.__setattr__(key, value) diff --git a/code/RL_model/verl/Search-R1/verl/utils/ray_utils.py b/code/RL_model/verl/Search-R1/verl/utils/ray_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9a75df6c37bc5a295aaa192b2a56cca2423e94b9 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/ray_utils.py @@ -0,0 +1,43 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Contains commonly used utilities for ray +""" + +import ray + +import concurrent.futures + + +def parallel_put(data_list, max_workers=None): + + def put_data(index, data): + return index, ray.put(data) + + if max_workers is None: + max_workers = min(len(data_list), 16) + + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + data_list_f = [executor.submit(put_data, i, data) for i, data in enumerate(data_list)] + res_lst = [] + for future in concurrent.futures.as_completed(data_list_f): + res_lst.append(future.result()) + + # reorder based on index + output = [None for _ in range(len(data_list))] + for res in res_lst: + index, data_ref = res + output[index] = data_ref + + return output diff --git a/code/RL_model/verl/Search-R1/verl/utils/seqlen_balancing.py b/code/RL_model/verl/Search-R1/verl/utils/seqlen_balancing.py new file mode 100644 index 0000000000000000000000000000000000000000..fee45da0d33264ea40591f95a98bdf35ef0ea4ad --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/seqlen_balancing.py @@ -0,0 +1,265 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Tuple, Callable +import heapq + +import torch +from torch import distributed as dist + +from tensordict import TensorDict +import copy + + +def karmarkar_karp(seqlen_list: List[int], k_partitions: int, equal_size: bool): + # see: https://en.wikipedia.org/wiki/Largest_differencing_method + class Set: + + def __init__(self) -> None: + self.sum = 0 + self.items = [] + + def add(self, idx: int, val: int): + self.items.append((idx, val)) + self.sum += val + + def merge(self, other): + for idx, val in other.items: + self.items.append((idx, val)) + self.sum += val + + def __lt__(self, other): + if self.sum != other.sum: + return self.sum < other.sum + if len(self.items) != len(other.items): + return len(self.items) < len(other.items) + return self.items < other.items + + class State: + + def __init__(self, items: List[Tuple[int, int]], k: int) -> None: + self.k = k + # sets should always be decreasing order + self.sets = [Set() for _ in range(k)] + assert len(items) in [1, k], f"{len(items)} not in [1, {k}]" + for i, (idx, seqlen) in enumerate(items): + self.sets[i].add(idx=idx, val=seqlen) + self.sets = sorted(self.sets, reverse=True) + + def spread(self): + return self.sets[0].sum - self.sets[-1].sum + + def get_partitions(self): + partitions = [] + for i in range(len(self.sets)): + cur_partition = [] + for idx, _ in self.sets[i].items: + cur_partition.append(idx) + partitions.append(cur_partition) + return partitions + + def merge(self, other): + for i in range(self.k): + self.sets[i].merge(other.sets[self.k - 1 - i]) + self.sets = sorted(self.sets, reverse=True) + + @property + def spread(self) -> int: + return self.sets[0].sum - self.sets[-1].sum + + def __lt__(self, other): + # least heap, let the state with largest spread to be popped first, + # if the spread is the same, let the state who has the largest set + # to be popped first. 
+ if self.spread != other.spread: + return self.spread > other.spread + return self.sets[0] > other.sets[0] + + def __repr__(self) -> str: + repr_str = "[" + for i in range(self.k): + if i > 0: + repr_str += "," + repr_str += "{" + for j, (_, seqlen) in enumerate(self.sets[i].items): + if j > 0: + repr_str += "," + repr_str += str(seqlen) + repr_str += "}" + repr_str += "]" + return repr_str + + sorted_seqlen_list = sorted([(seqlen, i) for i, seqlen in enumerate(seqlen_list)]) + states_pq = [] + if equal_size: + assert len(seqlen_list) % k_partitions == 0, f"{len(seqlen_list)} % {k_partitions} != 0" + for offset in range(0, len(sorted_seqlen_list), k_partitions): + items = [] + for i in range(k_partitions): + seqlen, idx = sorted_seqlen_list[offset + i] + items.append((idx, seqlen)) + heapq.heappush(states_pq, State(items=items, k=k_partitions)) + else: + for seqlen, idx in sorted_seqlen_list: + heapq.heappush(states_pq, State(items=[(idx, seqlen)], k=k_partitions)) + + while len(states_pq) > 1: + state0 = heapq.heappop(states_pq) + state1 = heapq.heappop(states_pq) + # merge states + state0.merge(state1) + heapq.heappush(states_pq, state0) + + final_state = states_pq[0] + partitions = final_state.get_partitions() + if equal_size: + for i, partition in enumerate(partitions): + assert len(partition) * \ + k_partitions == len(seqlen_list), f"{len(partition)} * {k_partitions} != {len(seqlen_list)}" + return partitions + + +def greedy_partition(seqlen_list: List[int], k_partitions: int, equal_size: bool): + bias = sum(seqlen_list) + 1 if equal_size else 0 + sorted_seqlen = [(seqlen + bias, i) for i, seqlen in enumerate(seqlen_list)] + partitions = [[] for _ in range(k_partitions)] + partition_sums = [0 for _ in range(k_partitions)] + for seqlen, i in sorted_seqlen: + min_idx = None + for j in range(k_partitions): + if min_idx is None or partition_sums[j] < partition_sums[min_idx]: + min_idx = j + partitions[min_idx].append(i) + partition_sums[min_idx] += seqlen + if equal_size: + for i, partition in enumerate(partitions): + assert len(partition) * \ + k_partitions == len(seqlen_list), f"{len(partition)} * {k_partitions} != {len(seqlen_list)}" + return partitions + + +def get_seqlen_balanced_partitions(seqlen_list: List[int], k_partitions: int, equal_size: bool): + """ get order of seq lengths to make partitions balanced, this is + used in balacing sum of seqlength across dp ranks and microbatches + Parameters: + seqlen_list (List[int]): + seq lengths of each items + k_partitions (int): + resulting number of partitions + equal_size (bool): + if True, number of items in each partitions must be equal. + if False, only consider balancing the sum, each partition can have + variable number of items + Returns: + partitions (List[List[int]]): + return k_partitions list containing the index of items. 
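+    Example (illustrative):
+        seqlen_list=[1, 5, 2, 4], k_partitions=2, equal_size=True
+        -> e.g. [[2, 3], [0, 1]], where both partitions sum to 6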
+ """ + assert len(seqlen_list) >= k_partitions, f"number of items:[{len(seqlen_list)}] < k_partitions:[{k_partitions}]" + + def _check_and_sort_partitions(partitions): + assert len(partitions) == k_partitions, f"{len(partitions)} != {k_partitions}" + seen_idx = set() + sorted_partitions = [None] * k_partitions + for i, partition in enumerate(partitions): + assert len(partition) > 0, f"the {i}-th partition is empty" + for idx in partition: + seen_idx.add(idx) + sorted_partitions[i] = sorted(partition) + assert seen_idx == set(range(len(seqlen_list))) + return sorted_partitions + + partitions = karmarkar_karp(seqlen_list=seqlen_list, k_partitions=k_partitions, equal_size=equal_size) + return _check_and_sort_partitions(partitions) + + +def log_seqlen_unbalance(seqlen_list: List[int], partitions: List[List[int]], prefix): + # add some metrics of seqlen sum on dp ranks + k_partition = len(partitions) + # assert len(seqlen_list) % k_partition == 0 + batch_size = len(seqlen_list) // k_partition + min_sum_seqlen = None + max_sum_seqlen = None + total_sum_seqlen = 0 + for offset in range(0, len(seqlen_list), batch_size): + cur_sum_seqlen = sum(seqlen_list[offset:offset + batch_size]) + if min_sum_seqlen is None or cur_sum_seqlen < min_sum_seqlen: + min_sum_seqlen = cur_sum_seqlen + if max_sum_seqlen is None or cur_sum_seqlen > max_sum_seqlen: + max_sum_seqlen = cur_sum_seqlen + total_sum_seqlen += cur_sum_seqlen + + balanced_sum_seqlen_list = [] + for partition in partitions: + cur_sum_seqlen_balanced = sum([seqlen_list[i] for i in partition]) + balanced_sum_seqlen_list.append(cur_sum_seqlen_balanced) + # print("balanced_sum_seqlen_list: ", balanced_sum_seqlen_list) + min_sum_seqlen_balanced = min(balanced_sum_seqlen_list) + max_sum_seqlen_balanced = max(balanced_sum_seqlen_list) + + return { + f'{prefix}/min': min_sum_seqlen, + f'{prefix}/max': max_sum_seqlen, + f'{prefix}/minmax_diff': max_sum_seqlen - min_sum_seqlen, + f'{prefix}/balanced_min': min_sum_seqlen_balanced, + f'{prefix}/balanced_max': max_sum_seqlen_balanced, + f'{prefix}/mean': total_sum_seqlen / len(partitions) + } + + +def ceildiv(a, b): + return -(a // -b) + + +def rearrange_micro_batches(batch: TensorDict, max_token_len, dp_group=None): + """Split the batch into a list of micro_batches, where the max_token_len is smaller than max_token_len + and the number of valid tokens in each micro batch is well balanced. + """ + # this is per local micro_bsz + max_seq_len = batch['attention_mask'].shape[-1] + assert max_token_len >= max_seq_len, \ + f'max_token_len must be greater than the sequence length. 
Got {max_token_len=} and {max_seq_len=}' + + seq_len_effective: torch.Tensor = batch['attention_mask'].sum(dim=1) + total_seqlen = seq_len_effective.sum().item() + num_micro_batches = ceildiv(total_seqlen, max_token_len) + if dist.is_initialized(): + num_micro_batches = torch.tensor([num_micro_batches], device='cuda') + dist.all_reduce(num_micro_batches, op=dist.ReduceOp.MAX, group=dp_group) + num_micro_batches = num_micro_batches.cpu().item() + + seq_len_effective = seq_len_effective.tolist() + assert num_micro_batches <= len(seq_len_effective) + + micro_bsz_idx = get_seqlen_balanced_partitions(seq_len_effective, num_micro_batches, equal_size=False) + + micro_batches = [] + + for partition in micro_bsz_idx: + curr_micro_batch = [] + for idx in partition: + curr_micro_batch.append(batch[idx:idx + 1]) + curr_micro_batch = torch.cat(curr_micro_batch) + + micro_batches.append(curr_micro_batch) + + return micro_batches, micro_bsz_idx + + +def get_reverse_idx(idx_map): + reverse_idx_map = copy.deepcopy(idx_map) + + for i, idx in enumerate(idx_map): + reverse_idx_map[idx] = i + + return reverse_idx_map diff --git a/code/RL_model/verl/Search-R1/verl/utils/tokenizer.py b/code/RL_model/verl/Search-R1/verl/utils/tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..b64b6623ac62b6b3f4288dccf8f5307fc87439c7 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/tokenizer.py @@ -0,0 +1,58 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utils for tokenization.""" +import warnings + +__all__ = ['hf_tokenizer'] + + +def set_pad_token_id(tokenizer): + """Set pad_token_id to eos_token_id if it is None. + + Args: + tokenizer (transformers.PreTrainedTokenizer): The tokenizer to be set. + + """ + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + warnings.warn(f'tokenizer.pad_token_id is None. Now set to {tokenizer.eos_token_id}') + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + warnings.warn(f'tokenizer.pad_token is None. Now set to {tokenizer.eos_token}') + + +def hf_tokenizer(name_or_path, correct_pad_token=True, correct_gemma2=True, **kwargs): + """Create a huggingface pretrained tokenizer. + + Args: + name (str): The name of the tokenizer. + correct_pad_token (bool): Whether to correct the pad token id. + correct_gemma2 (bool): Whether to correct the gemma2 tokenizer. + **kwargs: The keyword arguments for the tokenizer. + + Returns: + transformers.PreTrainedTokenizer: The pretrained tokenizer. + + """ + from transformers import AutoTokenizer + if correct_gemma2 and isinstance(name_or_path, str) and 'gemma-2-2b-it' in name_or_path: + # the EOS token in gemma2 is ambiguious, which may worsen RL performance. + # https://huggingface.co/google/gemma-2-2b-it/commit/17a01657f5c87135bcdd0ec7abb4b2dece04408a + warnings.warn('Found gemma-2-2b-it tokenizer. 
Set eos_token and eos_token_id to and 107.') + kwargs['eos_token'] = '' + kwargs['eos_token_id'] = 107 + tokenizer = AutoTokenizer.from_pretrained(name_or_path, **kwargs) + if correct_pad_token: + set_pad_token_id(tokenizer) + return tokenizer \ No newline at end of file diff --git a/code/RL_model/verl/Search-R1/verl/utils/torch_dtypes.py b/code/RL_model/verl/Search-R1/verl/utils/torch_dtypes.py new file mode 100644 index 0000000000000000000000000000000000000000..bb63df13b9c26802dff23c92ae8e36f5c23ae4fd --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/torch_dtypes.py @@ -0,0 +1,82 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Adapted from Cruise. +""" + +import torch + +from typing import Union + +HALF_LIST = [16, "16", "fp16", "float16"] +FLOAT_LIST = [32, "32", "fp32", "float32"] +BFLOAT_LIST = ["bf16", "bfloat16"] + + +class PrecisionType(object): + """Type of precision used. + + >>> PrecisionType.HALF == 16 + True + >>> PrecisionType.HALF in (16, "16") + True + """ + + HALF = "16" + FLOAT = "32" + FULL = "64" + BFLOAT = "bf16" + MIXED = "mixed" + + @staticmethod + def supported_type(precision: Union[str, int]) -> bool: + return any(x == precision for x in PrecisionType) + + @staticmethod + def supported_types() -> list[str]: + return [x.value for x in PrecisionType] + + @staticmethod + def is_fp16(precision): + return precision in HALF_LIST + + @staticmethod + def is_fp32(precision): + return precision in FLOAT_LIST + + @staticmethod + def is_bf16(precision): + return precision in BFLOAT_LIST + + @staticmethod + def to_dtype(precision): + if precision in HALF_LIST: + return torch.float16 + elif precision in FLOAT_LIST: + return torch.float32 + elif precision in BFLOAT_LIST: + return torch.bfloat16 + else: + raise RuntimeError(f"unexpected precision: {precision}") + + @staticmethod + def to_str(precision): + if precision == torch.float16: + return 'fp16' + elif precision == torch.float32: + return 'fp32' + elif precision == torch.bfloat16: + return 'bf16' + else: + raise RuntimeError(f"unexpected precision: {precision}") diff --git a/code/RL_model/verl/Search-R1/verl/utils/torch_functional.py b/code/RL_model/verl/Search-R1/verl/utils/torch_functional.py new file mode 100644 index 0000000000000000000000000000000000000000..3d53ca7a4e40efc715ceba1f3a8c725c2fe256a0 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/torch_functional.py @@ -0,0 +1,492 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +Contain small torch utilities +""" + +from typing import Dict, Union, List, Optional + +import os +import torch +import torch.distributed +import torch.nn.functional as F +from tensordict import TensorDict +from torch import nn + +try: + from flash_attn.ops.triton.cross_entropy import cross_entropy_loss + FLAH_ATTN_CROSS_ENTROPY_LOSS_AVAILABLE = True +except ImportError: + FLAH_ATTN_CROSS_ENTROPY_LOSS_AVAILABLE = False + + +def gather_from_labels(data, label): + """Gather the label from data. The value in label should be [0, vocab_size) + + Args: + data: (..., vocab_size) + label (torch.IntTensor) : (...,) + + Returns: + + """ + + output = torch.gather(data, -1, label.unsqueeze(-1)).squeeze(-1) + return output + + +def logprobs_from_logits(logits, labels): + """ + See: https://github.com/pytorch/pytorch/issues/563#issuecomment-330103591 + """ + if FLAH_ATTN_CROSS_ENTROPY_LOSS_AVAILABLE: + batch_dim = logits.shape[:-1] + last_dim = logits.shape[-1] + logits = logits.reshape(-1, last_dim) + labels = labels.reshape(-1) + output = logprobs_from_logits_flash_attn(logits, labels) + output = output.view(*batch_dim) + else: + output = logprobs_from_logits_naive(logits, labels) + return output + + +def logprobs_from_logits_flash_attn(logits, labels): + output = -cross_entropy_loss(logits, labels)[0] + return output + + +def logprobs_from_logits_naive(logits, labels): + logp = F.log_softmax(logits, dim=-1) + logpy = gather_from_labels(logp, labels) + return logpy + + +def logprobs_of_labels_v2(logits: torch.FloatTensor, labels): + """ + A memory efficient implementation of logprobs_from_logits + """ + assert logits.dtype == torch.float32, 'Using bf16 logits with logprobs_of_labels_v2 may lead to divergence' + logprobs_labels = torch.gather(logits, dim=-1, index=labels.unsqueeze(-1)) + logprobs_labels = logprobs_labels - torch.logsumexp(logits, dim=-1, keepdim=True) + return logprobs_labels.squeeze(-1) + + +def clip_by_value(x, tensor_min, tensor_max): + """ + Tensor extenstion to torch.clamp + https://github.com/pytorch/pytorch/issues/2793#issuecomment-428784713 + """ + clipped = torch.max(torch.min(x, tensor_max), tensor_min) + return clipped + + +def entropy_from_logits(logits: torch.Tensor): + """Calculate entropy from logits.""" + pd = torch.nn.functional.softmax(logits, dim=-1) + entropy = torch.logsumexp(logits, dim=-1) - torch.sum(pd * logits, dim=-1) + return entropy + + +def masked_sum(values, mask, axis=None): + """Compute mean of tensor with a masked values.""" + return (values * mask).sum(axis=axis) + + +def masked_mean(values, mask, axis=None): + """Compute mean of tensor with a masked values.""" + return (values * mask).sum(axis=axis) / mask.sum(axis=axis) + + +def masked_var(values, mask, unbiased=True): + """Compute variance of tensor with masked values.""" + mean = masked_mean(values, mask) + centered_values = values - mean + variance = masked_mean(centered_values**2, mask) + if unbiased: + mask_sum = mask.sum() + if mask_sum == 0: + raise ValueError("At least one element in the mask has to be 1.") + # note that if mask_sum == 1, then there is a division by zero issue + # to avoid it you just need to use a larger minibatch_size + if mask_sum == 1: + raise ValueError("The sum of the mask is one, which can cause a division by zero.") + bessel_correction = mask_sum / (mask_sum - 1) + variance = variance * bessel_correction + return variance + + +def masked_whiten(values, mask, 
shift_mean=True): + """Whiten values with masked values.""" + mean, var = masked_mean(values, mask), masked_var(values, mask) + whitened = (values - mean) * torch.rsqrt(var + 1e-8) + if not shift_mean: + whitened += mean + return whitened + + +def get_eos_mask(response_id: torch.Tensor, eos_token: int = 2, dtype=torch.int64): + ''' + e.g. end of sentence token=1 + response_id: [0, 0, 2, 42, 3, 5, 1, 0, 0] + eos_mask: [1, 1, 1, 1, 1, 1, 1, 0, 0] + ''' + eos_mask = response_id.eq(eos_token).long() + eos_mask = (torch.cumsum(eos_mask, dim=1) - eos_mask).bool() + eos_mask = torch.logical_not(eos_mask).to(dtype) + return eos_mask + + +def compute_grad_norm(model: nn.Module): + total_grad_square = 0 + total_params = 0 + for param in model.parameters(): + if param.grad is not None: + total_grad_square += torch.sum(torch.square(param.grad.detach())).item() + return total_grad_square + + +def broadcast_dict_tensor(tensors: Union[Dict[str, torch.Tensor], TensorDict], src, group): + """ + TODO: optimize this. Technically, we only need one broadcast + """ + + for key in tensors.sorted_keys: + torch.distributed.broadcast(tensors[key], src=src, group=group, async_op=False) + + +def allgather_dict_tensors(tensors: Union[Dict[str, torch.Tensor], TensorDict], size, group, dim=0): + """ + TODO: optimize this. + - We can use async ops + - We can use only one allgather + Args: + tensors: + size: + group: + + Returns: + + """ + if isinstance(tensors, TensorDict): + is_tensor_dict = True + tensors_as_dict = tensors.to_dict() + else: + tensors_as_dict = tensors + is_tensor_dict = False + + output = {} + sorted_keys = sorted(tensors_as_dict.keys()) + for key in sorted_keys: + val = tensors_as_dict[key] + output[key] = [torch.empty_like(val) for _ in range(size)] + torch.distributed.all_gather(output[key], val, group=group, async_op=False) + output[key] = torch.cat(output[key], dim=dim) + + if is_tensor_dict: + output = TensorDict(source=output, batch_size=tensors.batch_size[0] * size) + + return output + + +def split_dict_tensor_into_batches(tensors: TensorDict, batch_size) -> List[TensorDict]: + assert tensors.batch_size[0] % batch_size == 0, \ + f'input data batch size: {tensors.batch_size[0]}, split batch size: {batch_size}' + return tensors.split(batch_size) + + +def pad_sequence_to_length(tensors, max_seq_len, pad_token_id, left_pad=False): + """ + pad a 2D tensors (e.g. responses, logprobs) in the last dim to max_seq_length. + input shape: [bs, seq_length] + output shape: [bs, max_seq_length] + (0, max_seq_len - tensors.shape[-1]) means right pad to max_seq_length and no left pad + """ + if tensors.shape[-1] >= max_seq_len: + return tensors + pad_tuple = (max_seq_len - tensors.shape[-1], 0) if left_pad else (0, max_seq_len - tensors.shape[-1]) + return F.pad(tensors, pad_tuple, 'constant', pad_token_id) + + +from transformers import PreTrainedTokenizer + + +def tokenize_and_postprocess_data(prompt: str, + tokenizer: PreTrainedTokenizer, + max_length: int, + pad_token_id: int, + left_pad=True, + truncation='error'): + """ + input_data is the output from tokenizer. 
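+
+    Example (illustrative): with max_length=8, pad_token_id=0 and left_pad=True,
+    a prompt that tokenizes to 5 ids comes back as input_ids of shape (1, 8) with
+    three pad ids on the left and an attention_mask that is 0 over the padding.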
+ """ + assert truncation in ['left', 'right', 'error'] + + input_data = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) + + input_ids = input_data['input_ids'] + attention_mask = input_data['attention_mask'] + + assert input_ids.ndim == 2 + + sequence_length = input_ids.shape[-1] + if sequence_length < max_length: + input_ids = pad_sequence_to_length(input_ids, + max_seq_len=max_length, + pad_token_id=pad_token_id, + left_pad=left_pad) + attention_mask = pad_sequence_to_length(attention_mask, + max_seq_len=max_length, + pad_token_id=0, + left_pad=left_pad) + elif sequence_length > max_length: + if truncation == 'left': + # actually, left truncation may not be reasonable + input_ids = input_ids[:, -max_length:] + attention_mask = attention_mask[:, -max_length:] + elif truncation == 'right': + input_ids = input_ids[:, :max_length] + attention_mask = attention_mask[:, :max_length] + elif truncation == 'error': + raise NotImplementedError(f'{sequence_length=} is larger than {max_length=}') + else: + raise NotImplementedError(f'Unknown truncation method {truncation}') + + return input_ids, attention_mask + + +def remove_pad_token(input_ids: torch.Tensor, attention_mask: torch.Tensor): + """ Remove the pad token. + + Args: + input_ids shape: [bs, seq_length] + attention_mask shape: [bs, seq_length] + Returns: + no_padding_batch(List[List[int]]): contains the rmpad token ids per query. + """ + no_padding_batch = [] + for ids, mask in zip(input_ids, attention_mask): + no_padding_batch.append((ids[len(ids) - mask.sum():]).cpu().numpy().tolist()) + return no_padding_batch + + +def log_probs_from_logits_response(input_ids, logits, response_length): + """Compute the response log_probs from full logits. Note that logits = model(input_ids) + + Args: + input_ids: [batch_size, seqlen] + logits: [batch_size, seqlen, vocab_size] + + Returns: + response_log_prob: + """ + response_logits = logits[:, -response_length - 1:-1] + response = input_ids[:, -response_length:] + response_log_prob = logprobs_from_logits(logits=response_logits, labels=response) + return response_log_prob + + +def log_probs_from_logits_response_rmpad(input_ids, attention_mask, logits_rmpad, response_length): + """Compute the log_probs from logits with rmpad logits and pad input. Note that + logits_rmpad = model(input_ids_rmpad). For each sentences, there is a shift between + logits and input_ids. 
+ The reason for this function to is to compute logprobs_from_logits in rmpad mode because it is memory-intensive + for large vocab_size + + Args: + input_ids: [batch_size, seqlen] + attention_mask: [batch_size, seqlen] + logits_rmpad: [total_nnz, vocab_size] + response_length: int + """ + from flash_attn.bert_padding import pad_input, unpad_input + + batch_size, seqlen = input_ids.shape + input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1), attention_mask=attention_mask) + input_ids_rmpad = input_ids_rmpad.squeeze(-1) + input_ids_rmpad_rolled = torch.roll(input_ids_rmpad, shifts=-1, dims=0) + full_log_probs_rmpad = logprobs_from_logits(logits=logits_rmpad, labels=input_ids_rmpad_rolled) # (total_nnz,) + full_output = pad_input(hidden_states=full_log_probs_rmpad.unsqueeze(-1), + indices=indices, + batch=batch_size, + seqlen=seqlen) + output = full_output.squeeze(-1)[:, -response_length - 1:-1] # [batch_size, response_length] + return output + + +def log_probs_from_logits_all_rmpad(input_ids_rmpad, logits_rmpad, indices, batch_size, seqlen, response_length): + """Compute the log_probs from logits with rmpad input_ids and logits. Note that + logits_rmpad = model(input_ids_rmpad). For each sentences, there is a shift between + logits and input_ids. + The reason for this function to is to compute logprobs_from_logits in rmpad mode because it is memory-intensive + for large vocab_size + + Args: + input_ids_rmpad: [1, total_nnz] + logits_rmpad: [total_nnz, vocab_size] + indices: [total_nnz] + batch_size: int + seqlen: int + response_length: int + """ + from flash_attn.bert_padding import pad_input + input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # transpose back to [total_nnz, 1] + input_ids_rmpad = input_ids_rmpad.squeeze(-1) + input_ids_rmpad_rolled = torch.roll(input_ids_rmpad, shifts=-1, dims=0) + full_log_probs_rmpad = logprobs_from_logits(logits=logits_rmpad, labels=input_ids_rmpad_rolled) # (total_nnz,) + full_output = pad_input(hidden_states=full_log_probs_rmpad.unsqueeze(-1), + indices=indices, + batch=batch_size, + seqlen=seqlen) + output = full_output.squeeze(-1)[:, -response_length - 1:-1] # [batch_size, response_length] + return output + + +from transformers.generation.logits_process import (TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper) + + +def post_process_logits(input_ids, logits, temperature, top_k, top_p): + if temperature != 1.: + logits = logits.div_(temperature) # inplace operation to avoid OOM + # TODO: add them back + # if top_k is not None and top_k > 0: + # logits = TopKLogitsWarper(top_k=top_k)(input_ids, logits) + # if top_p is not None and top_p < 1.0 and top_p > 0.0: + # logits = TopPLogitsWarper(top_p=top_p)(input_ids, logits) + return logits + + +""" +Optimizer related +""" + +from torch.optim import Optimizer +from torch.optim.lr_scheduler import LambdaLR +import math + + +def get_cosine_schedule_with_warmup( + optimizer: Optimizer, + num_warmup_steps: int, + num_training_steps: int, + min_lr_ratio: float = 0.0, + num_cycles: float = 0.5, + last_epoch: int = -1, +): + """ + Create a schedule with a learning rate that decreases following the values of the cosine function between the + initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the + initial lr set in the optimizer. + Args: + optimizer (:class:`~torch.optim.Optimizer`): + The optimizer for which to schedule the learning rate. + num_warmup_steps (:obj:`int`): + The number of steps for the warmup phase. 
+ num_training_steps (:obj:`int`): + The total number of training steps. + min_lr_ratio (:obj:`float`, `optional`, defaults to 0.0): + The minimum lr ratio w.r.t the maximum. + num_cycles (:obj:`float`, `optional`, defaults to 0.5): + The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 + following a half-cosine). + last_epoch (:obj:`int`, `optional`, defaults to -1): + The index of the last epoch when resuming training. + Return: + :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + """ + assert min_lr_ratio >= 0 and min_lr_ratio <= 1. + coef = (1 - min_lr_ratio) * 0.5 + intercept = (1 + min_lr_ratio) * 0.5 + + def lr_lambda(current_step): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) + x = math.cos(math.pi * float(num_cycles) * 2.0 * progress) + return max(0.0, x * coef + intercept) + + return LambdaLR(optimizer, lr_lambda, last_epoch) + + +def get_constant_schedule_with_warmup( + optimizer: Optimizer, + num_warmup_steps: int, + last_epoch: int = -1, +): + + def lr_lambda(current_step): + return min(1, float(current_step) / float(max(1, num_warmup_steps))) + + return LambdaLR(optimizer, lr_lambda, last_epoch) + + +def prepare_decoder_attention_mask(attention_mask, input_shape, inputs_embeds): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, + tgt_len=input_shape[-1]).to(inputs_embeds.device) + combined_attention_mask = (expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + + combined_attention_mask) + + return combined_attention_mask + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +def get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) diff --git a/code/RL_model/verl/Search-R1/verl/utils/tracking.py b/code/RL_model/verl/Search-R1/verl/utils/tracking.py new file mode 100644 index 0000000000000000000000000000000000000000..b1fbd6f330451b89286644e226fb743237bc436c --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/tracking.py @@ -0,0 +1,103 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +A unified tracking interface that supports logging data to different backend +""" +import dataclasses +from enum import Enum +from functools import partial +from pathlib import Path +from typing import List, Union, Dict, Any + + +class Tracking(object): + supported_backend = ['wandb', 'mlflow', 'console'] + + def __init__(self, project_name, experiment_name, default_backend: Union[str, List[str]] = 'console', config=None): + if isinstance(default_backend, str): + default_backend = [default_backend] + for backend in default_backend: + if backend == 'tracking': + import warnings + warnings.warn("`tracking` logger is deprecated. 
use `wandb` instead.", DeprecationWarning) + else: + assert backend in self.supported_backend, f'{backend} is not supported' + + self.logger = {} + + if 'tracking' in default_backend or 'wandb' in default_backend: + import wandb + import os + WANDB_API_KEY = os.environ.get("WANDB_API_KEY", None) + if WANDB_API_KEY: + wandb.login(key=WANDB_API_KEY) + wandb.init(project=project_name, name=experiment_name, config=config) + self.logger['wandb'] = wandb + + if 'mlflow' in default_backend: + import mlflow + mlflow.start_run(run_name=experiment_name) + mlflow.log_params(_compute_mlflow_params_from_objects(config)) + self.logger['mlflow'] = _MlflowLoggingAdapter() + + if 'console' in default_backend: + from verl.utils.logger.aggregate_logger import LocalLogger + self.console_logger = LocalLogger(print_to_console=True) + self.logger['console'] = self.console_logger + + def log(self, data, step, backend=None): + for default_backend, logger_instance in self.logger.items(): + if backend is None or default_backend in backend: + logger_instance.log(data=data, step=step) + + +class _MlflowLoggingAdapter: + + def log(self, data, step): + import mlflow + mlflow.log_metrics(metrics=data, step=step) + + +def _compute_mlflow_params_from_objects(params) -> Dict[str, Any]: + if params is None: + return {} + + return _flatten_dict(_transform_params_to_json_serializable(params, convert_list_to_dict=True), sep='/') + + +def _transform_params_to_json_serializable(x, convert_list_to_dict: bool): + _transform = partial(_transform_params_to_json_serializable, convert_list_to_dict=convert_list_to_dict) + + if dataclasses.is_dataclass(x): + return _transform(dataclasses.asdict(x)) + if isinstance(x, dict): + return {k: _transform(v) for k, v in x.items()} + if isinstance(x, list): + if convert_list_to_dict: + return {'list_len': len(x)} | {f'{i}': _transform(v) for i, v in enumerate(x)} + else: + return [_transform(v) for v in x] + if isinstance(x, Path): + return str(x) + if isinstance(x, Enum): + return x.value + + return x + + +def _flatten_dict(raw: Dict[str, Any], *, sep: str) -> Dict[str, Any]: + import pandas as pd + ans = pd.json_normalize(raw, sep=sep).to_dict(orient='records')[0] + assert isinstance(ans, dict) + return ans diff --git a/code/RL_model/verl/Search-R1/verl/utils/ulysses.py b/code/RL_model/verl/Search-R1/verl/utils/ulysses.py new file mode 100644 index 0000000000000000000000000000000000000000..c085becc591d29a9517966cdee601843bdf24371 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/utils/ulysses.py @@ -0,0 +1,288 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utilities for DeepSpeed Ulysses Sequence Parallelism. 
+DeepSpeed Ulysses Paper: https://arxiv.org/abs/2309.14509 +Inspired from: https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/sequence/layer.py +""" +from typing import Any, Optional, List, Tuple + +import torch +from torch import Tensor +import torch.distributed as dist +from torch.distributed import ProcessGroup + +_ULYSSES_SEQUENCE_PARALLEL_GROUP = None + + +def set_ulysses_sequence_parallel_group(group: dist.ProcessGroup): + """ + Set ulysses sequence parallel process group. + """ + global _ULYSSES_SEQUENCE_PARALLEL_GROUP + _ULYSSES_SEQUENCE_PARALLEL_GROUP = group + + +def get_ulysses_sequence_parallel_group() -> Optional[dist.ProcessGroup]: + """ + Get ulysses sequence parallel process group. + """ + global _ULYSSES_SEQUENCE_PARALLEL_GROUP + return _ULYSSES_SEQUENCE_PARALLEL_GROUP + + +def get_ulysses_sequence_parallel_world_size(group: ProcessGroup = None) -> int: + """ + Get ulysses sequence parallel world size. + """ + group = get_ulysses_sequence_parallel_group() if group is None else group + return dist.get_world_size(group) if group else 1 + + +def get_ulysses_sequence_parallel_rank(group: ProcessGroup = None) -> int: + """ + Get ulysses sequence parallel rank. + """ + group = get_ulysses_sequence_parallel_group() if group is None else group + return dist.get_rank(group) if group else 0 + + +def gather_seq_scatter_heads( + x: Tensor, + seq_dim: int, + head_dim: int, + unpadded_dim_size: int = 0, + group: ProcessGroup = None, +) -> Tensor: + """ + A func to sync embedding input with alltoall in sequence parallel + gather sequence dimension and scatter head dim: + e.g. seq_dim: 1, head_dim: 2 + [bsz, seq/n, h, ...] -> [bsz, seq, h/n, ...] + """ + group = get_ulysses_sequence_parallel_group() if group is None else group + if not group: + return x + sp_world = get_ulysses_sequence_parallel_world_size(group) + x = SeqAllToAll.apply(group, x, head_dim, seq_dim) + if unpadded_dim_size and unpadded_dim_size % sp_world != 0: + padding_size = x.size(seq_dim) - unpadded_dim_size + x = _unpad_tensor(x, seq_dim, padding_size) + return x + + +def gather_heads_scatter_seq(x: Tensor, head_dim: int, seq_dim: int, group: ProcessGroup = None) -> Tensor: + """ + A func to sync attention result with alltoall in sequence parallel + gather head dimension and scatter seq dim: + e.g. seq_dim: 1, head_dim: 2 + [bsz, seq, h/n, ...] -> [bsz, seq/n, h, ...] 
+ """ + group = get_ulysses_sequence_parallel_group() if group is None else group + if not group: + return x + dim_size = x.size(seq_dim) + sp_world = get_ulysses_sequence_parallel_world_size(group) + if dim_size % sp_world != 0: + padding_size = sp_world - (dim_size % sp_world) + x = _pad_tensor(x, seq_dim, padding_size) + return SeqAllToAll.apply(group, x, seq_dim, head_dim, False) + + +def _pad_tensor(x: Tensor, dim: int, padding_size: int) -> Tensor: + shape = list(x.shape) + shape[dim] = padding_size + pad = torch.zeros(shape, dtype=x.dtype, device=x.device) + return torch.cat([x, pad], dim=dim) + + +def _unpad_tensor(x: Tensor, dim: int, padding_size: int) -> Tensor: + slc = [slice(None)] * len(x.shape) + slc[dim] = slice(0, -padding_size) + return x[slc] + + +def slice_input_tensor(x: Tensor, dim: int, padding: bool = True, group: ProcessGroup = None) -> Tensor: + group = get_ulysses_sequence_parallel_group() if group is None else group + sp_world_size = dist.get_world_size(group) + sp_rank = get_ulysses_sequence_parallel_rank() + dim_size = x.size(dim) + # pad before slice + if padding and dim_size % sp_world_size: + padding_size = sp_world_size - (dim_size % sp_world_size) + x = _pad_tensor(x, dim, padding_size) + # slice the input tensor + parts = x.size(dim) // sp_world_size + slc = [slice(None)] * len(x.shape) + slc[dim] = slice(sp_rank * parts, (sp_rank + 1) * parts) + return x[slc].contiguous() + + +def all_to_all_tensor( + local_input: Tensor, + scatter_dim: int, + gather_dim: int, + group: Optional[dist.ProcessGroup] = None, + async_op: bool = False, +): + group = get_ulysses_sequence_parallel_group() if group is None else group + seq_world_size = dist.get_world_size(group) + input_list = [t.contiguous() for t in torch.tensor_split(local_input, seq_world_size, scatter_dim)] + output_list = [torch.empty_like(input_list[0]) for _ in range(seq_world_size)] + comm = dist.all_to_all(output_list, input_list, group=group, async_op=async_op) + if async_op: + + def wait(): + comm.wait() + return torch.cat(output_list, dim=gather_dim).contiguous() + + return wait + return torch.cat(output_list, dim=gather_dim).contiguous() + + +def all_gather_tensor(local_tensor: Tensor, group: Optional[dist.ProcessGroup] = None, async_op: bool = False): + group = get_ulysses_sequence_parallel_group() if group is None else group + sp_world_size = dist.get_world_size(group=group) + output_shape = list(local_tensor.shape) + output_shape[0] = output_shape[0] * sp_world_size + output = torch.empty(output_shape, dtype=local_tensor.dtype, device=local_tensor.device) + dist.all_gather_into_tensor(output, local_tensor, group=group, async_op=async_op) + return output + + +class SeqAllToAll(torch.autograd.Function): + + @staticmethod + def forward( + ctx: Any, + group: dist.ProcessGroup, + local_input: Tensor, + scatter_dim: int, + gather_dim: int, + async_op: bool = False, + ) -> Tensor: + ctx.group = group + ctx.scatter_dim = scatter_dim + ctx.gather_dim = gather_dim + ctx.async_op = async_op + return all_to_all_tensor(local_input, scatter_dim, gather_dim, group, async_op) + + @staticmethod + def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor, None, None]: + if ctx.async_op: + input_t = torch.cat(grad_output[1:], dim=ctx.gather_dim).contiguous() + else: + input_t = grad_output[0] + return ( + None, + all_to_all_tensor(input_t, ctx.gather_dim, ctx.scatter_dim, ctx.group, False), + None, + None, + None, + None, + ) + + +class Gather(torch.autograd.Function): + + @staticmethod + def forward(ctx: 
Any, + group: dist.ProcessGroup, + local_tensor: Tensor, + gather_dim: int, + grad_scaler: bool = True, + async_op=False) -> Tensor: + ctx.group = group + ctx.gather_dim = gather_dim + ctx.grad_scaler = grad_scaler + ctx.async_op = async_op + + sp_world_size = dist.get_world_size(group=group) + ctx.sp_world_size = sp_world_size + + sp_rank = dist.get_rank(group=group) + ctx.sp_rank = sp_rank + + local_shape = list(local_tensor.size()) + split_size = local_shape[0] + part_size = local_shape[gather_dim] # store original size + ctx.part_size = part_size + + output = all_gather_tensor(local_tensor, group, async_op) + return torch.cat(output.split(split_size, dim=0), dim=gather_dim) + + @staticmethod + def backward(ctx: Any, grad_output: Tensor) -> Any: + if ctx.grad_scaler: + grad_output = grad_output * ctx.sp_world_size + return (None, grad_output.split(ctx.part_size, + dim=ctx.gather_dim)[ctx.sp_rank].contiguous(), None, None, None, None) + + +def gather_outpus_and_unpad(x: Tensor, + gather_dim: int, + unpad_dim: int = None, + padding_size: int = 0, + grad_scaler: bool = True, + group: Optional[dist.ProcessGroup] = None): + group = get_ulysses_sequence_parallel_group() if group is None else group + sp_size = get_ulysses_sequence_parallel_world_size() + if group == None: + return x + x = Gather.apply(group, x, gather_dim, grad_scaler) + if unpad_dim is not None: + assert isinstance(padding_size, int), 'padding size is not given or is not an integer' + if padding_size == 0: + return x + x = _unpad_tensor(x, unpad_dim, padding_size) + return x + + +def ulysses_pad_and_slice_inputs(input_ids_rmpad: torch.Tensor, + position_ids_rmpad: Optional[torch.Tensor] = None, + sp_size: int = 1): + """ + Pad and slice input_ids to be divisible by sp_size + Pad position_ids to be divisible by sp_size. + + Note both input_ids_rmpad and position_ids_rmpad will be padded, + but only input_ids will be sliced. + + The is the utility of pre-forward for ulysses sequence parallelism + + Args: + input_ids_rmpad: shape of [bsz, seqlen] + position_ids_rmpad: shape of [bsz, seqlen], where bsz must be 1 + sp_size (int): ulysses sequence parallelism size + + Returns: + torch.Tensor: padded and sliced input_ids + torch.Tensor: padded and sliced position_ids + int: pad size + """ + if position_ids_rmpad is not None: + assert position_ids_rmpad.size(0) == 1 + assert input_ids_rmpad.size(1) == position_ids_rmpad.size(1) + if sp_size <= 1: + return input_ids_rmpad, position_ids_rmpad, 0 + _, total_seq_len = input_ids_rmpad.shape + pad_size = (sp_size - total_seq_len % sp_size) % sp_size + if pad_size > 0: + input_ids_rmpad = torch.nn.functional.pad(input_ids_rmpad, (0, pad_size), value=0) + if position_ids_rmpad is not None: + pad_pos_ids = torch.arange(pad_size, device=position_ids_rmpad.device).unsqueeze(0) + position_ids_rmpad = torch.cat((position_ids_rmpad, pad_pos_ids), dim=-1) + # we don't need to slice position ids + input_ids_rmpad = slice_input_tensor(input_ids_rmpad, dim=1, padding=False) + return input_ids_rmpad, position_ids_rmpad, pad_size diff --git a/code/RL_model/verl/Search-R1/verl/workers/__init__.py b/code/RL_model/verl/Search-R1/verl/workers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/workers/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 Bytedance Ltd. 
and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py b/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py new file mode 100644 index 0000000000000000000000000000000000000000..e5ba4ea39448b3b4af59f5340f75212761ca4e72 --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py @@ -0,0 +1,1054 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +The main entry point to run the PPO algorithm +""" + +import logging +import os +import warnings + +import torch +import torch.distributed +import verl.utils.hdfs_io as hdfs_io +import verl.utils.torch_functional as verl_F +from omegaconf import DictConfig, open_dict +from verl import DataProto +from verl.single_controller.base import Worker +from verl.single_controller.base.decorator import register, Dispatch +from verl.utils import hf_tokenizer +from verl.utils.debug import log_gpu_memory_usage +from verl.utils.fs import copy_local_path_from_hdfs +from verl.utils.fsdp_utils import get_fsdp_wrap_policy, offload_fsdp_grad, init_fn, get_init_weight_context_manager +from verl.utils.fsdp_utils import offload_fsdp_optimizer, offload_fsdp_param_and_grad, load_fsdp_optimizer, \ + load_fsdp_param_and_grad +from verl.utils.import_utils import import_external_libs +from verl.utils.model import compute_position_id_with_mask +from verl.utils.flops_counter import FlopsCounter +from verl.workers.sharding_manager.fsdp_ulysses import FSDPUlyssesShardingManager + +from codetiming import Timer + +logger = logging.getLogger(__file__) +logger.setLevel(os.getenv('VERL_PPO_LOGGING_LEVEL', 'WARN')) + + +class ActorRolloutRefWorker(Worker): + """ + This worker can be instantiated as a standalone actor or a standalone rollout or a standalone reference policy + or a hybrid engine based on the config.rollout + """ + + def __init__(self, config: DictConfig, role: str): + super().__init__() + self.config = config + import torch.distributed + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl") + + # build device mesh for FSDP + world_size = torch.distributed.get_world_size() + from torch.distributed.device_mesh import init_device_mesh + # TODO(sgm): support FSDP hybrid shard for larger model + self.device_mesh = init_device_mesh('cuda', mesh_shape=(world_size,), mesh_dim_names=['fsdp']) + + # build device mesh for Ulysses Sequence Parallel + self.ulysses_device_mesh = None 
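+ # Illustrative sizing only (these numbers are not taken from any config here): with world_size=8 and
+ # ulysses_sequence_parallel_size=2, dp=4, so the mesh below is built with shape (4, 2), i.e.
+ # 4 data-parallel groups, each split across 2 sequence-parallel ranks.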
+ self.ulysses_sequence_parallel_size = self.config.actor.get('ulysses_sequence_parallel_size', 1)
+ dp = world_size // self.ulysses_sequence_parallel_size
+ if self.ulysses_sequence_parallel_size > 1:
+ self.ulysses_device_mesh = init_device_mesh('cuda',
+ mesh_shape=(dp, self.ulysses_sequence_parallel_size),
+ mesh_dim_names=['dp', 'sp'])
+
+ self.ulysses_sharding_manager = FSDPUlyssesShardingManager(self.ulysses_device_mesh)
+
+ self.role = role
+ assert self.role in ['actor', 'rollout', 'ref', 'actor_rollout', 'actor_rollout_ref']
+
+ self._is_actor = self.role in ['actor', 'actor_rollout', 'actor_rollout_ref']
+ self._is_rollout = self.role in ['rollout', 'actor_rollout', 'actor_rollout_ref']
+ self._is_ref = self.role in ['ref', 'actor_rollout_ref']
+
+ self._is_offload_param = False
+ self._is_offload_grad = False
+ self._is_offload_optimizer = False
+ if self._is_actor:
+ self._is_offload_param = self.config.actor.fsdp_config.get('param_offload', False)
+ self._is_offload_grad = self.config.actor.fsdp_config.get('grad_offload', False)
+ self._is_offload_optimizer = self.config.actor.fsdp_config.get('optimizer_offload', False)
+ elif self._is_ref:
+ # TODO: it seems that manual offload is slower than FSDP offload
+ self._is_offload_param = self.config.ref.fsdp_config.get('param_offload', False)
+
+ # normalize config
+ if self._is_actor:
+ self.config.actor.ppo_mini_batch_size //= (self.device_mesh.shape[0] // self.ulysses_sequence_parallel_size)
+ self.config.actor.ppo_micro_batch_size //= (self.device_mesh.shape[0] //
+ self.ulysses_sequence_parallel_size)
+ self.config.actor.ppo_mini_batch_size *= self.config.rollout.n
+ self.config.actor.ppo_micro_batch_size *= self.config.rollout.n
+ if self._is_rollout:
+ self.config.rollout.log_prob_micro_batch_size //= (self.device_mesh.shape[0] //
+ self.ulysses_sequence_parallel_size)
+ self.config.rollout.log_prob_micro_batch_size *= self.config.rollout.n
+ if self._is_ref:
+ self.config.ref.log_prob_micro_batch_size //= (self.device_mesh.shape[0] //
+ self.ulysses_sequence_parallel_size)
+ self.config.ref.log_prob_micro_batch_size *= self.config.rollout.n
+
+ def _build_model_optimizer(self,
+ model_path,
+ fsdp_config,
+ optim_config,
+ override_model_config,
+ use_remove_padding=False,
+ enable_gradient_checkpointing=False,
+ trust_remote_code=False):
+ from verl.utils.model import print_model_size, update_model_config
+ from verl.utils.torch_dtypes import PrecisionType
+ from transformers import AutoModelForCausalLM, AutoConfig
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy, MixedPrecision
+ from torch import optim
+
+ log_gpu_memory_usage('Before init from HF AutoModel', logger=logger)
+ local_path = copy_local_path_from_hdfs(model_path)
+
+ # note that we have to create model in fp32. Otherwise, the optimizer is in bf16, which is incorrect
+ # TODO(zhangchi.usc1992): 1. support create from random initialized model. 2.
Support init with FSDP directly + self.tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) + + torch_dtype = fsdp_config.get('model_dtype', None) + if torch_dtype is None: + torch_dtype = torch.float32 if self._is_actor else torch.bfloat16 + else: + torch_dtype = PrecisionType.to_dtype(torch_dtype) + + # override model kwargs + actor_model_config = AutoConfig.from_pretrained(local_path, trust_remote_code=trust_remote_code) + + if use_remove_padding: + from verl.models.registry import check_model_support_rmpad + check_model_support_rmpad(actor_model_config.model_type) + + if use_remove_padding and self.ulysses_sequence_parallel_size > 1: + from verl.models.transformers.monkey_patch import apply_monkey_patch + apply_monkey_patch(actor_model_config, verbose=True) + + override_config_kwargs = { + 'bos_token_id': self.tokenizer.bos_token_id, + 'eos_token_id': self.tokenizer.eos_token_id, + 'pad_token_id': self.tokenizer.pad_token_id, + } + override_config_kwargs.update(override_model_config) + update_model_config(actor_model_config, override_config_kwargs=override_config_kwargs) + if self.rank == 0: + print(f'Model config after override: {actor_model_config}') + + # NOTE(fix me): tie_word_embedding causes meta_tensor init to hang + init_context = get_init_weight_context_manager(use_meta_tensor=not actor_model_config.tie_word_embeddings) + + with init_context(), warnings.catch_warnings(): + warnings.simplefilter("ignore") + actor_module = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=local_path, + torch_dtype=torch_dtype, + config=actor_model_config, + attn_implementation='flash_attention_2', + trust_remote_code=trust_remote_code) + # some parameters may not in torch_dtype. TODO(zhangchi.usc1992) remove this after we switch to fsdp2 + actor_module.to(torch_dtype) + + if enable_gradient_checkpointing: + actor_module.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant': False}) + torch.distributed.barrier() + + if self.rank == 0: + print_model_size(actor_module) + + log_gpu_memory_usage('After init from HF AutoModel', logger=logger) + + # We wrap FSDP for rollout as well + mixed_precision_config = fsdp_config.get('mixed_precision', None) + if mixed_precision_config is not None: + param_dtype = PrecisionType.to_dtype(mixed_precision_config.get('param_dtype', 'bf16')) + reduce_dtype = PrecisionType.to_dtype(mixed_precision_config.get('reduce_dtype', 'fp32')) + buffer_dtype = PrecisionType.to_dtype(mixed_precision_config.get('buffer_dtype', 'fp32')) + else: + param_dtype = torch.bfloat16 + reduce_dtype = torch.float32 + buffer_dtype = torch.float32 + + mixed_precision = MixedPrecision(param_dtype=param_dtype, reduce_dtype=reduce_dtype, buffer_dtype=buffer_dtype) + + if self._is_ref: + mixed_precision = None + + auto_wrap_policy = get_fsdp_wrap_policy(module=actor_module, config=fsdp_config.get('wrap_policy', None)) + + if self._is_rollout and self.config.rollout.name == 'hf': + # TODO(zhangchi.usc1992, shengguangming) fix me. 
Currently, auto_wrap_policy causes HFRollout to hang in Gemma
+ auto_wrap_policy = None
+
+ print(f'wrap_policy: {auto_wrap_policy}')
+
+ # TODO(sgm): support hybrid
+ if auto_wrap_policy is None:
+ sharding_strategy = ShardingStrategy.SHARD_GRAD_OP
+ else:
+ sharding_strategy = ShardingStrategy.FULL_SHARD
+
+ # TODO: add transformer policy
+ actor_module_fsdp = FSDP(
+ actor_module,
+ param_init_fn=init_fn,
+ use_orig_params=False,
+ auto_wrap_policy=auto_wrap_policy,
+ device_id=torch.cuda.current_device(),
+ sharding_strategy=sharding_strategy, # zero3
+ mixed_precision=mixed_precision,
+ sync_module_states=True,
+ device_mesh=self.device_mesh,
+ forward_prefetch=False)
+
+ log_gpu_memory_usage('After Actor FSDP init', logger=logger)
+
+ # TODO: add more optimizer args into config
+ if self._is_actor:
+ from verl.utils.torch_functional import get_constant_schedule_with_warmup
+ actor_optimizer = optim.AdamW(actor_module_fsdp.parameters(),
+ lr=optim_config.lr,
+ betas=optim_config.get('betas', (0.9, 0.999)),
+ weight_decay=optim_config.get('weight_decay', 1e-2))
+
+ total_steps = optim_config.get('total_training_steps', 0)
+ num_warmup_steps_ratio = optim_config.get('lr_warmup_steps_ratio', 0.)
+ num_warmup_steps = int(num_warmup_steps_ratio * total_steps)
+
+ print(f'Total steps: {total_steps}, num_warmup_steps: {num_warmup_steps}')
+
+ actor_lr_scheduler = get_constant_schedule_with_warmup(optimizer=actor_optimizer,
+ num_warmup_steps=num_warmup_steps)
+ else:
+ actor_optimizer = None
+ actor_lr_scheduler = None
+
+ log_gpu_memory_usage('After actor optimizer init', logger=logger)
+
+ return actor_module_fsdp, actor_optimizer, actor_lr_scheduler, actor_model_config
+
+ def _build_rollout(self):
+ from torch.distributed.device_mesh import init_device_mesh
+ # TODO(sgm): support FSDP hybrid shard for larger model
+ infer_tp = self.config.rollout.tensor_model_parallel_size
+ dp = self.world_size // infer_tp
+ assert self.world_size % infer_tp == 0, f'rollout world_size: {self.world_size} is not divisible by infer_tp: {infer_tp}'
+ rollout_device_mesh = init_device_mesh('cuda', mesh_shape=(dp, infer_tp), mesh_dim_names=['dp', 'infer_tp'])
+
+ if self.config.rollout.name == 'hf':
+ from verl.workers.rollout import HFRollout
+ from verl.workers.sharding_manager import BaseShardingManager
+ rollout = HFRollout(module=self.actor_module_fsdp, config=self.config.rollout)
+ rollout_sharding_manager = BaseShardingManager()
+ # TODO: a sharding manager that does nothing?
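+ # The vLLM branch below pairs the rollout with FSDPVLLMShardingManager, which takes the
+ # FSDP-sharded actor module and the vLLM inference engine so the actor weights can be synced
+ # into vLLM before generation.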
+ elif self.config.rollout.name == 'vllm': + from verl.workers.rollout.vllm_rollout import vLLMRollout + from verl.workers.sharding_manager import FSDPVLLMShardingManager + log_gpu_memory_usage('Before building vllm rollout', logger=None) + rollout = vLLMRollout(actor_module=self.actor_module_fsdp, + config=self.config.rollout, + tokenizer=self.tokenizer, + model_hf_config=self.actor_model_config) + log_gpu_memory_usage('After building vllm rollout', logger=None) + if torch.distributed.get_world_size() == 1: + self.config.rollout.load_format = 'dummy_hf' + rollout_sharding_manager = FSDPVLLMShardingManager(module=self.actor_module_fsdp, + inference_engine=rollout.inference_engine, + model_config=self.actor_model_config, + full_params='hf' in self.config.rollout.load_format, + device_mesh=rollout_device_mesh) + log_gpu_memory_usage('After building sharding manager', logger=None) + + return rollout, rollout_sharding_manager + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def init_model(self): + from verl.workers.actor import DataParallelPPOActor + # This is used to import external_lib into the huggingface systems + import_external_libs(self.config.model.get('external_lib', None)) + + from omegaconf import OmegaConf + override_model_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create())) + + use_remove_padding = self.config.model.get('use_remove_padding', False) + + if self._is_actor or self._is_rollout: + # we need the model for actor and rollout + if self._is_actor: + optim_config = self.config.actor.optim + fsdp_config = self.config.actor.fsdp_config + else: + optim_config = None + fsdp_config = OmegaConf.create() + self.actor_module_fsdp, self.actor_optimizer, self.actor_lr_scheduler, self.actor_model_config = self._build_model_optimizer( + model_path=self.config.model.path, + fsdp_config=fsdp_config, + optim_config=optim_config, + override_model_config=override_model_config, + use_remove_padding=use_remove_padding, + enable_gradient_checkpointing=self.config.model.get('enable_gradient_checkpointing', False), + trust_remote_code=self.config.model.get('trust_remote_code', False)) + + # get the original unwrapped module + self.actor_module = self.actor_module_fsdp._fsdp_wrapped_module + + if self._is_offload_param: + # param is require during state_dict in sharding manager + offload_fsdp_grad(module=self.actor_module_fsdp) + log_gpu_memory_usage('After offload actor grad during init', logger=logger) + if self._is_offload_optimizer: + offload_fsdp_optimizer(optimizer=self.actor_optimizer) + log_gpu_memory_usage('After offload actor optimizer during init', logger=logger) + # load from checkpoint + if self._is_actor: + OmegaConf.set_struct(self.config.actor, True) + with open_dict(self.config.actor): + self.config.actor.use_remove_padding = use_remove_padding + self.actor = DataParallelPPOActor(config=self.config.actor, + actor_module=self.actor_module_fsdp, + actor_optimizer=self.actor_optimizer) + + if self._is_rollout: + self.rollout, self.rollout_sharding_manager = self._build_rollout() + + if self._is_ref: + self.ref_module_fsdp = self._build_model_optimizer(model_path=self.config.model.path, + fsdp_config=self.config.ref.fsdp_config, + optim_config=None, + override_model_config=override_model_config, + use_remove_padding=use_remove_padding, + trust_remote_code=self.config.model.get( + 'trust_remote_code', False))[0] + if self._is_offload_param: + offload_fsdp_param_and_grad(module=self.ref_module_fsdp, offload_grad=self._is_offload_grad) + + 
OmegaConf.set_struct(self.config.ref, True) + with open_dict(self.config.ref): + self.config.ref.use_remove_padding = use_remove_padding + self.ref_policy = DataParallelPPOActor(config=self.config.ref, actor_module=self.ref_module_fsdp) + + if self._is_actor: + self.flops_counter = FlopsCounter(self.actor_model_config) + + torch.cuda.empty_cache() + + @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) + def update_actor(self, data: DataProto): + data = data.to('cuda') + + assert self._is_actor + if self._is_offload_param: + load_fsdp_param_and_grad(module=self.actor_module_fsdp, + device_id=torch.cuda.current_device(), + load_grad=self._is_offload_grad) + if self._is_offload_optimizer: + load_fsdp_optimizer(optimizer=self.actor_optimizer, device_id=torch.cuda.current_device()) + + data.batch = data.batch.cuda() + + log_gpu_memory_usage('Before update policy', logger=logger) + + with self.ulysses_sharding_manager: + data = self.ulysses_sharding_manager.preprocess_data(data=data) + # perform training + with Timer(name='update_policy', logger=None) as timer: + metrics = self.actor.update_policy(data=data) + delta_time = timer.last + global_num_tokens = data.meta_info['global_token_num'] + estimated_flops, promised_flops = self.flops_counter.estimate_flops(global_num_tokens, delta_time) + metrics['mfu/actor'] = estimated_flops * self.config.actor.ppo_epochs / promised_flops / self.world_size + + self.actor_lr_scheduler.step() + lr = self.actor_lr_scheduler.get_last_lr()[0] + metrics['actor/lr'] = lr + + log_gpu_memory_usage('After update policy', logger=logger) + + # TODO: here, we should return all metrics + output = DataProto(meta_info={'metrics': metrics}) + + output = self.ulysses_sharding_manager.postprocess_data(data=output) + output = output.to('cpu') + + if self._is_offload_param: + offload_fsdp_param_and_grad(module=self.actor_module_fsdp, offload_grad=self._is_offload_grad) + if self._is_offload_optimizer: + offload_fsdp_optimizer(optimizer=self.actor_optimizer) + torch.cuda.empty_cache() + return output + + @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) + def compute_log_prob(self, data: DataProto) -> DataProto: + """mostly copying from generate_sequences""" + data = data.to('cuda') + + assert self._is_rollout + if self._is_offload_param: + load_fsdp_param_and_grad(module=self.actor_module_fsdp, + device_id=torch.cuda.current_device(), + load_grad=self._is_offload_grad) + + data.batch = data.batch.cuda() + meta_info = {'eos_token_id': self.tokenizer.eos_token_id, 'pad_token_id': self.tokenizer.pad_token_id} + data.meta_info.update(meta_info) + + with self.ulysses_sharding_manager: + data = self.ulysses_sharding_manager.preprocess_data(data) + old_log_probs = self.actor.compute_log_prob(data=data) + output = DataProto.from_dict(tensors={'old_log_probs': old_log_probs}) + output = self.ulysses_sharding_manager.postprocess_data(output) + + output = output.to('cpu') + + if self._is_offload_param: + # NOTE(sgm): the grad is already in CPU, only offload param here + offload_fsdp_param_and_grad(module=self.actor_module_fsdp, offload_grad=self._is_offload_grad) + # clear kv cache + torch.cuda.empty_cache() + log_gpu_memory_usage('After recompute log prob', logger=logger) + return output + + @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) + def generate_sequences(self, prompts: DataProto): + prompts = prompts.to('cuda') + # set to False if it is validation + recompute_log_prob = prompts.meta_info.get('recompute_log_prob', True) + + assert self._is_rollout + if self._is_offload_param: 
+ load_fsdp_param_and_grad(module=self.actor_module_fsdp, + device_id=torch.cuda.current_device(), + load_grad=self._is_offload_grad) + + prompts.batch = prompts.batch.cuda() + meta_info = {'eos_token_id': self.tokenizer.eos_token_id, 'pad_token_id': self.tokenizer.pad_token_id} + prompts.meta_info.update(meta_info) + with self.rollout_sharding_manager: + log_gpu_memory_usage('After entering rollout sharding manager', logger=logger) + + prompts = self.rollout_sharding_manager.preprocess_data(prompts) + output = self.rollout.generate_sequences(prompts=prompts) + + log_gpu_memory_usage('After rollout generation', logger=logger) + + output = self.rollout_sharding_manager.postprocess_data(output) + + if self._is_actor and recompute_log_prob: + # we should always recompute old_log_probs when it is HybridEngine + output.meta_info['micro_batch_size'] = self.config.rollout.log_prob_micro_batch_size + output.meta_info['max_token_len'] = self.config.rollout.log_prob_max_token_len_per_gpu + output.meta_info['use_dynamic_bsz'] = self.config.rollout.log_prob_use_dynamic_bsz + output.meta_info['temperature'] = self.config.rollout.temperature + # perform recompute log_prob + with self.ulysses_sharding_manager: + output = self.ulysses_sharding_manager.preprocess_data(output) + old_log_probs = self.actor.compute_log_prob(data=output) + output.batch['old_log_probs'] = old_log_probs + output = self.ulysses_sharding_manager.postprocess_data(output) + + output = output.to('cpu') + + if self._is_offload_param: + # NOTE(sgm): the grad is already in CPU, only offload param here + offload_fsdp_param_and_grad(module=self.actor_module_fsdp, offload_grad=self._is_offload_grad) + # clear kv cache + torch.cuda.empty_cache() + log_gpu_memory_usage('After recompute log prob', logger=logger) + return output + + @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) + def compute_ref_log_prob(self, data: DataProto): + assert self._is_ref + + data = data.to('cuda') + + if self._is_offload_param: + load_fsdp_param_and_grad(module=self.ref_module_fsdp, + device_id=torch.cuda.current_device(), + load_grad=self._is_offload_grad) + + micro_batch_size = self.config.ref.log_prob_micro_batch_size + data.meta_info['micro_batch_size'] = micro_batch_size + data.meta_info['temperature'] = self.config.rollout.temperature + data.meta_info['max_token_len'] = self.config.ref.log_prob_max_token_len_per_gpu + data.meta_info['use_dynamic_bsz'] = self.config.ref.log_prob_use_dynamic_bsz + with self.ulysses_sharding_manager: + data = self.ulysses_sharding_manager.preprocess_data(data) + output = self.ref_policy.compute_log_prob(data=data) + output = DataProto.from_dict(tensors={'ref_log_prob': output}) + output = self.ulysses_sharding_manager.postprocess_data(output) + + output = output.to('cpu') + + if self._is_offload_param: + offload_fsdp_param_and_grad(module=self.ref_module_fsdp, offload_grad=self._is_offload_grad) + torch.cuda.empty_cache() + return output + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def save_checkpoint(self, local_path, hdfs_path=None): + assert self._is_actor + import torch + if self._is_offload_param: + load_fsdp_param_and_grad(module=self.actor_module_fsdp, + device_id=torch.cuda.current_device(), + load_grad=self._is_offload_grad) + + # TODO: support DCP and save sharded checkpoints + import torch.distributed + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType, FullStateDictConfig + cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) + with 
FSDP.state_dict_type(self.actor.actor_module, StateDictType.FULL_STATE_DICT, cfg): + state_dict = self.actor.actor_module.state_dict() + if self.rank == 0: + print(f'Saving actor checkpoint to {local_path}') + os.makedirs(local_path, exist_ok=True) + self.actor_module.save_pretrained(local_path, state_dict=state_dict) + self.tokenizer.save_pretrained(local_path) + if hdfs_path is not None: + print(f'Uploading actor checkpoint to {hdfs_path}') + hdfs_io.makedirs(hdfs_path, exist_ok=True) + hdfs_io.copy(src=local_path, dst=hdfs_path) + + torch.distributed.barrier() + if self._is_offload_param: + offload_fsdp_param_and_grad(module=self.actor_module_fsdp, offload_grad=self._is_offload_grad) + + +class CriticWorker(Worker): + + def __init__(self, config): + super().__init__() + import torch.distributed + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl") + self.config = config + + # build device mesh for Ulysses Sequence Parallel + world_size = torch.distributed.get_world_size() + from torch.distributed.device_mesh import init_device_mesh + self.ulysses_device_mesh = None + self.ulysses_sequence_parallel_size = self.config.get('ulysses_sequence_parallel_size', 1) + dp = world_size // self.ulysses_sequence_parallel_size + if self.ulysses_sequence_parallel_size > 1: + self.ulysses_device_mesh = init_device_mesh('cuda', + mesh_shape=(dp, self.ulysses_sequence_parallel_size), + mesh_dim_names=['dp', 'sp']) + + self.ulysses_sharding_manager = FSDPUlyssesShardingManager(self.ulysses_device_mesh) + + # set FSDP offload params + self._is_offload_param = self.config.model.fsdp_config.param_offload + self._is_offload_grad = self.config.model.fsdp_config.grad_offload + self._is_offload_optimizer = self.config.model.fsdp_config.optimizer_offload + + # normalize config + self.config.ppo_mini_batch_size //= (torch.distributed.get_world_size() // self.ulysses_sequence_parallel_size) + self.config.ppo_micro_batch_size //= (torch.distributed.get_world_size() // self.ulysses_sequence_parallel_size) + self.config.forward_micro_batch_size //= (torch.distributed.get_world_size() // + self.ulysses_sequence_parallel_size) + + def _build_critic_model_optimizer(self, config): + # the following line is necessary + from verl.utils.model import LambdaLayer, print_model_size, squeeze + from verl.utils.torch_dtypes import PrecisionType + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy, MixedPrecision + from torch import optim + + local_path = copy_local_path_from_hdfs(config.model.path) + # note that the tokenizer between actor and critic may be different. So override tokenizer info with actor info + # using random initialized model from any architecture. May not be the same as Actor. 
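+ # The critic therefore loads its tokenizer from config.model.tokenizer_path (typically the actor's
+ # tokenizer) and copies that tokenizer's bos/eos/pad token ids into the critic model config below.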
+ + tokenizer_path = copy_local_path_from_hdfs(config.model.tokenizer_path) + self.tokenizer = hf_tokenizer(tokenizer_path, trust_remote_code=config.model.get('trust_remote_code', False)) + + from omegaconf import OmegaConf + override_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create())) + override_config_kwargs = { + 'bos_token_id': self.tokenizer.bos_token_id, + 'eos_token_id': self.tokenizer.eos_token_id, + 'pad_token_id': self.tokenizer.pad_token_id, + } + override_config_kwargs.update(override_config) + if self.rank == 0: + print(f'Critic overriding config {override_config_kwargs}') + + torch_dtype = self.config.model.fsdp_config.get('model_dtype', 'fp32') + torch_dtype = PrecisionType.to_dtype(torch_dtype) + + from transformers import AutoConfig, AutoModelForTokenClassification + from torch import nn + + trust_remote_code = False + critic_model_config = AutoConfig.from_pretrained(local_path, trust_remote_code=trust_remote_code) + critic_model_config.num_labels = 1 + + use_remove_padding = config.model.get('use_remove_padding', False) + if use_remove_padding: + from verl.models.registry import check_model_support_rmpad + check_model_support_rmpad(critic_model_config.model_type) + + if use_remove_padding and self.ulysses_sequence_parallel_size > 1: + from verl.models.transformers.monkey_patch import apply_monkey_patch + apply_monkey_patch(critic_model_config, verbose=True) + + init_context = get_init_weight_context_manager() + with init_context(), warnings.catch_warnings(): + warnings.simplefilter("ignore") + setattr(critic_model_config, 'classifier_dropout', 0.) + setattr(critic_model_config, 'hidden_dropout', '0') + critic_module = AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path=local_path, + torch_dtype=torch_dtype, + config=critic_model_config, + attn_implementation='flash_attention_2', + trust_remote_code=trust_remote_code) + + # some parameters may not in torch_dtype + critic_module.to(torch_dtype) + + if config.model.get('enable_gradient_checkpointing', False): + critic_module.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant': False}) + if self.rank == 0: + print_model_size(critic_module) + + self.critic_model_config = critic_model_config + + fsdp_config = self.config.model.fsdp_config + mixed_precision_config = fsdp_config.get('mixed_precision', None) + if mixed_precision_config is not None: + param_dtype = PrecisionType.to_dtype(mixed_precision_config.get('param_dtype', 'bf16')) + reduce_dtype = PrecisionType.to_dtype(mixed_precision_config.get('reduce_dtype', 'fp32')) + buffer_dtype = PrecisionType.to_dtype(mixed_precision_config.get('buffer_dtype', 'fp32')) + else: + param_dtype = torch.bfloat16 + reduce_dtype = torch.float32 + buffer_dtype = torch.float32 + + mixed_precision = MixedPrecision(param_dtype=param_dtype, reduce_dtype=reduce_dtype, buffer_dtype=buffer_dtype) + + auto_wrap_policy = get_fsdp_wrap_policy(module=critic_module, config=self.config.model.fsdp_config.wrap_policy) + + log_gpu_memory_usage('Before critic FSDP', logger=None) + + critic_module = FSDP(critic_module, + param_init_fn=init_fn, + use_orig_params=False, + auto_wrap_policy=auto_wrap_policy, + device_id=torch.cuda.current_device(), + sharding_strategy=ShardingStrategy.FULL_SHARD, + mixed_precision=mixed_precision, + sync_module_states=True, + forward_prefetch=False) + + log_gpu_memory_usage('After critic FSDP', logger=None) + + critic_optimizer = optim.AdamW(critic_module.parameters(), + 
lr=config.optim.lr, + betas=config.optim.get('betas', (0.9, 0.999)), + weight_decay=config.optim.get('weight_decay', 1e-2)) + + total_steps = config.optim.get('total_training_steps', 0) + num_warmup_steps_ratio = config.optim.get('lr_warmup_steps_ratio', 0.) + num_warmup_steps = int(num_warmup_steps_ratio * total_steps) + + print(f'Total steps: {total_steps}, num_warmup_steps: {num_warmup_steps}') + + from verl.utils.torch_functional import get_constant_schedule_with_warmup + critic_lr_scheduler = get_constant_schedule_with_warmup(optimizer=critic_optimizer, + num_warmup_steps=num_warmup_steps) + + return critic_module, critic_optimizer, critic_lr_scheduler + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def init_model(self): + # This is used to import external_lib into the huggingface systems + import_external_libs(self.config.model.get('external_lib', None)) + + from verl.workers.critic import DataParallelPPOCritic + self.critic_module, self.critic_optimizer, self.critic_lr_scheduler = self._build_critic_model_optimizer( + self.config) + + if self._is_offload_param: + offload_fsdp_param_and_grad(module=self.critic_module, offload_grad=self._is_offload_grad) + if self._is_offload_optimizer: + offload_fsdp_optimizer(optimizer=self.critic_optimizer) + + self.critic = DataParallelPPOCritic(config=self.config, + critic_module=self.critic_module, + critic_optimizer=self.critic_optimizer) + + self.flops_counter = FlopsCounter(self.critic_model_config) + + torch.cuda.empty_cache() + + @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) + def compute_values(self, data: DataProto): + data = data.to('cuda') + + if self._is_offload_param: + load_fsdp_param_and_grad(module=self.critic_module, + device_id=torch.cuda.current_device(), + load_grad=self._is_offload_grad) + micro_batch_size = self.config.forward_micro_batch_size + data.meta_info['micro_batch_size'] = micro_batch_size + data.meta_info['max_token_len'] = self.config.forward_max_token_len_per_gpu + data.meta_info['use_dynamic_bsz'] = self.config.use_dynamic_bsz + # perform forward computation + with self.ulysses_sharding_manager: + data = self.ulysses_sharding_manager.preprocess_data(data=data) + values = self.critic.compute_values(data=data) + output = DataProto.from_dict(tensors={'values': values}) + output = self.ulysses_sharding_manager.postprocess_data(data=output) + + output = output.to('cpu') + if self._is_offload_param: + offload_fsdp_param_and_grad(module=self.critic_module, offload_grad=self._is_offload_grad) + torch.cuda.empty_cache() + return output + + @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) + def update_critic(self, data: DataProto): + data = data.to('cuda') + if self._is_offload_param: + load_fsdp_param_and_grad(module=self.critic_module, + device_id=torch.cuda.current_device(), + load_grad=self._is_offload_grad) + if self._is_offload_optimizer: + load_fsdp_optimizer(optimizer=self.critic_optimizer, device_id=torch.cuda.current_device()) + + # perform forward computation + with self.ulysses_sharding_manager: + data = self.ulysses_sharding_manager.preprocess_data(data=data) + + with Timer(name='update_critic', logger=None) as timer: + metrics = self.critic.update_critic(data=data) + delta_time = timer.last + + global_num_tokens = data.meta_info['global_token_num'] + estimated_flops, promised_flops = self.flops_counter.estimate_flops(global_num_tokens, delta_time) + metrics['mfu/critic'] = estimated_flops * self.config.ppo_epochs / promised_flops / self.world_size + + self.critic_lr_scheduler.step() + lr = 
self.critic_lr_scheduler.get_last_lr()[0] + metrics['critic/lr'] = lr + + output = DataProto(batch=None, meta_info={'metrics': metrics}) + output = self.ulysses_sharding_manager.postprocess_data(data=output) + + if self._is_offload_param: + offload_fsdp_param_and_grad(module=self.critic_module, offload_grad=self._is_offload_grad) + if self._is_offload_optimizer: + offload_fsdp_optimizer(optimizer=self.critic_optimizer) + torch.cuda.empty_cache() + output = output.to('cpu') + return output + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def save_checkpoint(self, local_path, hdfs_path=None): + import torch + if self._is_offload_param: + load_fsdp_param_and_grad(module=self.critic_module, + device_id=torch.cuda.current_device(), + load_grad=self._is_offload_grad) + + # TODO: support DCP and save sharded checkpoints + import torch.distributed + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType, FullStateDictConfig + cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) + with FSDP.state_dict_type(self.critic_module, StateDictType.FULL_STATE_DICT, cfg): + state_dict = self.critic_module.state_dict() + if self.rank == 0: + print(f'Saving critic checkpoint to {local_path}') + os.makedirs(local_path, exist_ok=True) + self.critic_module._fsdp_wrapped_module.save_pretrained(local_path, state_dict=state_dict) + self.tokenizer.save_pretrained(local_path) + if hdfs_path is not None: + print(f'Uploading critic checkpoint to {hdfs_path}') + hdfs_io.makedirs(hdfs_path, exist_ok=True) + hdfs_io.copy(src=local_path, dst=hdfs_path) + + torch.distributed.barrier() + if self._is_offload_param: + offload_fsdp_param_and_grad(module=self.critic_module, offload_grad=self._is_offload_grad) + + +# TODO(sgm): we may need to extract it to dp_reward_model.py +class RewardModelWorker(Worker): + """ + Note that we only implement the reward model that is subclass of AutoModelForTokenClassification. 
+ """ + + def __init__(self, config): + super().__init__() + import torch.distributed + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl") + self.config = config + + # build device mesh for Ulysses Sequence Parallel + world_size = torch.distributed.get_world_size() + from torch.distributed.device_mesh import init_device_mesh + self.ulysses_device_mesh = None + self.ulysses_sequence_parallel_size = self.config.get('ulysses_sequence_parallel_size', 1) + dp = world_size // self.ulysses_sequence_parallel_size + if self.ulysses_sequence_parallel_size > 1: + self.ulysses_device_mesh = init_device_mesh('cuda', + mesh_shape=(dp, self.ulysses_sequence_parallel_size), + mesh_dim_names=['dp', 'sp']) + + self.ulysses_sharding_manager = FSDPUlyssesShardingManager(self.ulysses_device_mesh) + + self.use_remove_padding = self.config.model.get('use_remove_padding', False) + self.config.micro_batch_size //= torch.distributed.get_world_size() + + def _build_model(self, config): + # the following line is necessary + from transformers import AutoModelForTokenClassification, AutoConfig + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy, CPUOffload + + # download the checkpoint from hdfs + local_path = copy_local_path_from_hdfs(config.model.path) + + if self.config.model.input_tokenizer is None: + self._do_switch_chat_template = False + else: + self._do_switch_chat_template = True + input_tokenizer_local_path = copy_local_path_from_hdfs(config.model.input_tokenizer) + self.input_tokenizer = hf_tokenizer(input_tokenizer_local_path, + trust_remote_code=config.model.get('trust_remote_code', False)) + self.tokenizer = hf_tokenizer(local_path, trust_remote_code=config.model.get('trust_remote_code', False)) + + trust_remote_code = config.model.get('trust_remote_code', False) + model_config = AutoConfig.from_pretrained(local_path, trust_remote_code=trust_remote_code) + model_config.num_labels = 1 + + use_remove_padding = config.model.get('use_remove_padding', False) + if use_remove_padding: + from verl.models.registry import check_model_support_rmpad + check_model_support_rmpad(model_config.model_type) + + if use_remove_padding and self.ulysses_sequence_parallel_size > 1: + from verl.models.transformers.monkey_patch import apply_monkey_patch + apply_monkey_patch(model_config, verbose=True) + + # note that we have to create model in fp32. Otherwise, the optimizer is in bf16, which is incorrect + init_context = get_init_weight_context_manager(use_meta_tensor=not model_config.tie_word_embeddings) + + with init_context(), warnings.catch_warnings(): + warnings.simplefilter("ignore") + setattr(model_config, 'classifier_dropout', 0.) 
+ reward_module = AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path=local_path, + config=model_config, + torch_dtype=torch.bfloat16, + attn_implementation='flash_attention_2', + trust_remote_code=trust_remote_code) + reward_module.to(torch.bfloat16) + auto_wrap_policy = get_fsdp_wrap_policy(module=reward_module, config=self.config.model.fsdp_config) + + reward_module = FSDP( + reward_module, + param_init_fn=init_fn, + use_orig_params=False, + auto_wrap_policy=auto_wrap_policy, + device_id=torch.cuda.current_device(), + sharding_strategy=ShardingStrategy.FULL_SHARD, # zero3 + sync_module_states=True, + cpu_offload=CPUOffload(offload_params=self.config.model.fsdp_config.param_offload), + forward_prefetch=False) + + return reward_module + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def init_model(self): + # This is used to import external_lib into the huggingface systems + import_external_libs(self.config.model.get('external_lib', None)) + self.reward_module = self._build_model(config=self.config) + torch.cuda.empty_cache() + + def _forward_micro_batch(self, micro_batch): + from flash_attn.bert_padding import pad_input, unpad_input, index_first_axis, rearrange + from verl.utils.ulysses import ulysses_pad_and_slice_inputs, gather_outpus_and_unpad + + with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.bfloat16): + input_ids = micro_batch['input_ids'] + batch_size, seqlen = input_ids.shape + attention_mask = micro_batch['attention_mask'] + position_ids = micro_batch['position_ids'] + + if self.use_remove_padding: + input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1), + attention_mask) # input_ids_rmpad (total_nnz, ...) + input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz) + + # unpad the position_ids to align the rotary + position_ids_rmpad = index_first_axis(rearrange(position_ids.unsqueeze(-1), "b s ... 
-> (b s) ..."), + indices).transpose(0, 1) + + # pad and slice the inputs if sp > 1 + if self.ulysses_sequence_parallel_size > 1: + input_ids_rmpad, position_ids_rmpad, pad_size = ulysses_pad_and_slice_inputs(input_ids_rmpad, \ + position_ids_rmpad, \ + sp_size=self.ulysses_sequence_parallel_size) + + # only pass input_ids and position_ids to enable flash_attn_varlen + output = self.reward_module(input_ids=input_ids_rmpad, + attention_mask=None, + position_ids=position_ids_rmpad, + use_cache=False) # prevent model thinks we are generating + reward_rmpad = output.logits + reward_rmpad = reward_rmpad.squeeze(0) # (total_nnz) + + # gather output if sp > 1 + if self.ulysses_sequence_parallel_size > 1: + reward_rmpad = gather_outpus_and_unpad(reward_rmpad, + gather_dim=0, + unpad_dim=0, + padding_size=pad_size) + + # pad it back + rm_score = pad_input(reward_rmpad, indices=indices, batch=batch_size, seqlen=seqlen).squeeze(-1) + else: + output = self.reward_module(input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids) + rm_score = output.logits # (batch_size, seq_len, 1) + rm_score = rm_score.squeeze(-1) + + # extract the result of the last valid token + eos_mask_idx = torch.argmax(position_ids * attention_mask, dim=-1) # (bsz,) + rm_score = rm_score[torch.arange(batch_size), eos_mask_idx] + return rm_score + + def _expand_to_token_level(self, data: DataProto, scores: torch.Tensor): + batch_size = data.batch.batch_size[0] + # expand as token_level_reward + attention_mask = data.batch['attention_mask'] + position_ids = data.batch['position_ids'] + response_length = data.batch['responses'].shape[-1] + eos_mask_idx = torch.argmax(position_ids * attention_mask, dim=-1) # (bsz,) + token_level_scores = torch.zeros_like(attention_mask, dtype=scores.dtype) # (bsz, seqlen) + token_level_scores[torch.arange(batch_size), eos_mask_idx] = scores + + # select the response part + token_level_scores = token_level_scores[:, -response_length:] + + return token_level_scores + + def _switch_chat_template(self, data: DataProto): + src_max_length = data.batch['attention_mask'].shape[-1] + + src_tokenizer = self.input_tokenizer + target_tokenizer = self.tokenizer + + rm_input_ids = [] + rm_attention_mask = [] + + for i in range(data.batch.batch_size[0]): + # extract raw prompt + chat: list = data.non_tensor_batch['raw_prompt'][i].tolist() + + # extract response + response_ids = data.batch['responses'][i] + response_length = response_ids.shape[-1] + valid_response_length = data.batch['attention_mask'][i][-response_length:].sum() + valid_response_ids = response_ids[:valid_response_length] + + # decode + response = src_tokenizer.decode(valid_response_ids) + # remove bos and eos + response = response.replace(src_tokenizer.eos_token, '') + + chat.append({'role': 'assistant', 'content': response}) + + prompt_with_chat_template = target_tokenizer.apply_chat_template(chat, + add_generation_prompt=False, + tokenize=False) + if self.rank == 0 and i == 0: + # for debugging purpose + print(f'Switch template. 
chat: {prompt_with_chat_template}') + + # the maximum length is actually determined by the reward model itself + max_length = self.config.get('max_length', src_max_length) + if max_length is None: + max_length = src_max_length + input_ids, attention_mask = verl_F.tokenize_and_postprocess_data( + prompt=prompt_with_chat_template, + tokenizer=target_tokenizer, + max_length=max_length, + pad_token_id=target_tokenizer.pad_token_id, + left_pad=False, # right padding + truncation=self.config.get('truncation', 'right')) # truncate from the right + + rm_input_ids.append(input_ids) + rm_attention_mask.append(attention_mask) + + rm_input_ids = torch.cat(rm_input_ids, dim=0) + rm_attention_mask = torch.cat(rm_attention_mask, dim=0) + + rm_position_ids = compute_position_id_with_mask(rm_attention_mask) + + rm_inputs = {'input_ids': rm_input_ids, 'attention_mask': rm_attention_mask, 'position_ids': rm_position_ids} + + return DataProto.from_dict(rm_inputs) + + @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) + def compute_rm_score(self, data: DataProto): + import itertools + from verl.utils.seqlen_balancing import rearrange_micro_batches, get_reverse_idx + data = data.to('cuda') + if self._do_switch_chat_template: + rm_data = self._switch_chat_template(data) + + rm_data.batch = rm_data.batch.cuda() + + # perform forward computation + with self.ulysses_sharding_manager: + rm_data = self.ulysses_sharding_manager.preprocess_data(data=rm_data) + data = self.ulysses_sharding_manager.preprocess_data(data=data) + + use_dynamic_bsz = self.config.use_dynamic_bsz + if use_dynamic_bsz: + max_token_len = self.config.forward_max_token_len_per_gpu * self.ulysses_sequence_parallel_size + micro_batches, indices = rearrange_micro_batches(batch=rm_data.batch, max_token_len=max_token_len) + else: + micro_batches = rm_data.batch.split(self.config.micro_batch_size) + output = [] + for micro_batch in micro_batches: + rm_score = self._forward_micro_batch(micro_batch) + output.append(rm_score) + scores = torch.cat(output, dim=0) # (batch_size) + + if use_dynamic_bsz: + indices = list(itertools.chain.from_iterable(indices)) + assert len(indices) == scores.size(0), f"{len(indices)} vs. {scores.size()}" + revert_indices = torch.tensor(get_reverse_idx(indices), dtype=torch.long) + scores = scores[revert_indices] + + token_level_scores = self._expand_to_token_level(data, scores) + # Note that this is only the scores, may not be the final rewards used to train RL + output = DataProto.from_dict(tensors={'rm_scores': token_level_scores}) + output = self.ulysses_sharding_manager.postprocess_data(data=output) + + output = output.to('cpu') + torch.cuda.empty_cache() + return output diff --git a/code/RL_model/verl/Search-R1/verl/workers/megatron_workers.py b/code/RL_model/verl/Search-R1/verl/workers/megatron_workers.py new file mode 100644 index 0000000000000000000000000000000000000000..1143b7baa9ed1f15a9660fe892e77a57155b399e --- /dev/null +++ b/code/RL_model/verl/Search-R1/verl/workers/megatron_workers.py @@ -0,0 +1,735 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +The main entry point to run the PPO algorithm +""" + +import os +import logging +import ray +import torch +import torch.distributed +import torch.nn as nn +from omegaconf import DictConfig +from verl.single_controller.base.megatron.worker import MegatronWorker +from verl.workers.actor.megatron_actor import MegatronPPOActor +from verl.workers.critic.megatron_critic import MegatronPPOCritic +from verl.workers.sharding_manager import AllGatherPPModel +from verl.workers.reward_model.megatron.reward_model import MegatronRewardModel + +from verl.single_controller.base.decorator import register, Dispatch +from verl import DataProto +from verl.utils.fs import copy_local_path_from_hdfs +from verl.utils.debug import log_gpu_memory_usage +from verl.utils.model import load_megatron_model_weights +from verl.utils.megatron_utils import init_model_parallel_config +from verl.utils.megatron_utils import offload_megatron_param_and_grad, load_megatron_param_and_grad +from verl.utils import hf_tokenizer + +from megatron.core import parallel_state as mpu +from megatron.core import ModelParallelConfig + +logger = logging.getLogger(__file__) +logger.setLevel(os.getenv('VERL_PPO_LOGGING_LEVEL', 'WARN')) + + +def set_random_seed(seed): + import torch + import numpy as np + import random + torch.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + if torch.cuda.device_count() > 0: + from megatron.core import tensor_parallel + tensor_parallel.model_parallel_cuda_manual_seed(seed) + # FIXME: torch cumsum not support deterministic (used in vllm sampler), + # https://github.com/pytorch/pytorch/issues/89492 + # torch.use_deterministic_algorithms(True, warn_only=True) + # os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' + + +class ActorRolloutRefWorker(MegatronWorker): + """ + This worker can be instantiated as a standalone actor or a standalone rollout or a standalone reference policy + or a hybrid engine based on the config.rollout + """ + + def __init__(self, config: DictConfig, role: str): + super().__init__() + self.config = config + + # NOTE(sgm): We utilize colocate WorkerGroup by default. + # As a result, Workers for different model share the same process. + # Therefore, we only require one distribute initialization. + # To utilize different parallel startegy in different models: + # 1, users should disable WorkerDict; 2.assign different ResourcePool to different models, + # 3. 
and apply the following patch in ray==2.10, https://github.com/ray-project/ray/pull/44385 + if not torch.distributed.is_initialized(): + rank = int(os.environ['LOCAL_RANK']) + torch.distributed.init_process_group(backend="nccl") + torch.cuda.set_device(rank) + + if self.config.actor.megatron.sequence_parallel: + os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1' + mpu.initialize_model_parallel( + tensor_model_parallel_size=self.config.actor.megatron.tensor_model_parallel_size, + pipeline_model_parallel_size=self.config.actor.megatron.pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, + use_sharp=False, + context_parallel_size=1, + expert_model_parallel_size=1, + nccl_communicator_config_path=None, + ) + + set_random_seed(seed=self.config.actor.megatron.seed) + + self.role = role + assert self.role in ['actor', 'rollout', 'ref', 'actor_rollout', 'actor_rollout_ref'] + + self._is_actor = self.role in ['actor', 'actor_rollout', 'actor_rollout_ref'] + self._is_rollout = self.role in ['rollout', 'actor_rollout', 'actor_rollout_ref'] + self._is_ref = self.role in ['ref', 'actor_rollout_ref'] + + # TODO(sgm): Currently, we only support reference model param offload + # will support other offload later + self._is_offload_param = False + self._is_offload_grad = False + self._is_offload_optimizer = False + + # normalize config + if self._is_actor and self._is_rollout: + self.config.actor.ppo_mini_batch_size //= mpu.get_data_parallel_world_size() + self.config.actor.ppo_micro_batch_size //= mpu.get_data_parallel_world_size() + self.config.rollout.log_prob_micro_batch_size //= mpu.get_data_parallel_world_size() + self._is_offload_param = self.config.actor.get('param_offload', False) + self._is_offload_grad = self.config.actor.get('grad_offload', False) + self._is_offload_optimizer = self.config.actor.get('optimizer_offload', False) + elif self._is_ref: + self.config.ref.log_prob_micro_batch_size //= mpu.get_data_parallel_world_size() + self._is_offload_param = self.config.ref.get('param_offload', False) + + def _build_model_optimizer(self, + model_path, + megatron_config: ModelParallelConfig, + optim_config, + override_model_config, + enable_gradient_checkpointing=False): + from verl.utils.megatron.optimizer import get_megatron_optimizer + from megatron.core.models.gpt.gpt_model import ModelType + from verl.utils.model import print_model_size, update_model_config + from verl.utils.megatron_utils import get_model, init_megatron_optim_config + from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig + + # Step 1: initialize the tokenizer + local_path = copy_local_path_from_hdfs(model_path) + self.tokenizer = hf_tokenizer(local_path) + + # Step 2: get the actor_model_config + actor_model_config = AutoConfig.from_pretrained(local_path) + + override_config_kwargs = { + 'bos_token_id': self.tokenizer.bos_token_id, + 'eos_token_id': self.tokenizer.eos_token_id, + 'pad_token_id': self.tokenizer.pad_token_id, + } + override_config_kwargs.update(override_model_config) + update_model_config(actor_model_config, override_config_kwargs=override_config_kwargs) + + if self.rank == 0: + print(f'Model config after override: {actor_model_config}') + + def megatron_actor_model_provider(pre_process, post_process): + from verl.utils.model import get_parallel_model_from_config + # vpp is not supported yet because it will hang for some reason. 
Need debugging + vpp_rank = mpu.get_virtual_pipeline_model_parallel_rank() # this will be set inside get_model + # this_megatron_config = copy.deepcopy(megatron_config) + # this_megatron_config.virtual_pipeline_model_parallel_rank = vpp_rank + parallel_model = get_parallel_model_from_config(config=actor_model_config, + megatron_config=megatron_config, + pre_process=pre_process, + post_process=post_process, + value=False) + parallel_model.cuda() + return parallel_model + + # Step 3: initialize the megatron model + if self._is_actor and self._is_rollout: + # Initialize the 3D HybridEngine + hybrid_engine = AllGatherPPModel(model_provider=megatron_actor_model_provider) + # Fetch the model at current rank + actor_module = hybrid_engine.this_rank_models + if isinstance(actor_module, nn.ModuleList): + actor_module = [actor_module[0]] + if self.config.actor.load_weight: + load_megatron_model_weights(self.config, + actor_model_config, + actor_module, + params_dtype=megatron_config.params_dtype, + is_value_model=False) + + if self.rank == 0: + print_model_size(actor_module[0]) + log_gpu_memory_usage('After AllGatherPPModel init', logger=logger) + elif self._is_ref: + print(f'self.config.ref.load_weight: {self.config.ref.load_weight}') + ref_module = get_model(model_provider_func=megatron_actor_model_provider, + model_type=ModelType.encoder_or_decoder, + wrap_with_ddp=False) + # ref_module = nn.ModuleList(ref_module) + + if self.config.ref.load_weight: # should align with the actor: + assert self.config.actor.load_weight == self.config.ref.load_weight + print(f'load ref weight start') + load_megatron_model_weights(self.config, + actor_model_config, + ref_module, + params_dtype=megatron_config.params_dtype, + is_value_model=False) + log_gpu_memory_usage('After ref module init', logger=logger) + return ref_module, actor_model_config + + # TODO: add more optimizer args into config + if self._is_actor: + optim_config = init_megatron_optim_config(optim_config) + actor_optimizer = get_megatron_optimizer(model=actor_module, config=optim_config) + else: + optim_config = None + actor_optimizer = None + + log_gpu_memory_usage('After actor optimizer init', logger=logger) + + return actor_module, hybrid_engine, actor_optimizer, actor_model_config, optim_config + + def _build_rollout(self): + if self.config.rollout.name == 'vllm': + from verl.workers.rollout.vllm_rollout import vLLMRollout + from verl.workers.sharding_manager import MegatronVLLMShardingManager + from verl.utils.model import normalize_pp_vpp_params + + # NOTE(sgm): If the QKV and gate_up projection layer are concate together in actor, + # we will reorganize their weight format when resharding from actor to rollout. 
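+ # name patterns for the fused QKV and gate/up projection weights; the sharding manager
+ # uses them to reorganize these params when resharding from the actor to the rollout engine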
+ layer_name_mapping = { + "qkv_layer_name": + self.config.rollout.layer_name_map.get("qkv_layer_name", "qkv"), + "gate_proj_layer_name": + self.config.rollout.layer_name_map.get("gate_proj_layer_name", "linear_fc1.weight"), + } + + # reshard the weight partition from actor to rollout to initialize the rollout class + # create a new cuda space for parameters not in this pp rank + self.hybrid_engine.load_params_to_cuda() + # broadcast the parameters from pp rank to other ranks + self.hybrid_engine.allgather_params() + # obtain name to parameters in pp/vpp + params = self.hybrid_engine.get_all_params() + # update the param name for the + params = normalize_pp_vpp_params(params=params, + num_hidden_layers=self.actor_model_config.num_hidden_layers, + layer_name='layers') + rollout = vLLMRollout(actor_module=params, + config=self.config.rollout, + tokenizer=self.tokenizer, + model_hf_config=self.actor_model_config, + train_tp=mpu.get_tensor_model_parallel_world_size()) + log_gpu_memory_usage('After building vllm rollout', logger=logger) + + # perform weight resharding between actor and rollout + sharding_manager = MegatronVLLMShardingManager(module=self.hybrid_engine, + inference_engine=rollout.inference_engine, + model_config=self.actor_model_config, + layer_name_mapping=layer_name_mapping) + log_gpu_memory_usage('After building sharding manager', logger=logger) + else: + NotImplementedError('Only vllmRollout is supported with Megatron now') + + return rollout, sharding_manager + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def init_model(self): + if self.config.model.get('external_lib', None) is not None: + # This is used to import external_lib into the huggingface systems + import importlib + importlib.import_module(self.config.model.external_lib) + + from omegaconf import OmegaConf + from verl.utils.torch_dtypes import PrecisionType + override_model_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create())) + torch_dtype = torch.bfloat16 + + megatron_config = OmegaConf.create({ + 'sequence_parallel': self.config.actor.megatron.get('sequence_parallel', True), + 'param_dtype': PrecisionType.to_str(torch_dtype), + 'tensor_model_parallel_size': mpu.get_tensor_model_parallel_world_size(), + 'pipeline_model_parallel_rank': mpu.get_pipeline_model_parallel_rank(), + 'pipeline_model_parallel_size': mpu.get_pipeline_model_parallel_world_size(), + 'virtual_pipeline_model_parallel_rank': mpu.get_virtual_pipeline_model_parallel_rank(), + 'virtual_pipeline_model_parallel_size': mpu.get_virtual_pipeline_model_parallel_world_size() + }) + + megatron_config = init_model_parallel_config(megatron_config) + + if self._is_actor or self._is_rollout: + # we need the model for actor and rollout + if self._is_actor: + optim_config = self.config.actor.optim + else: + optim_config = None + self.actor_module, self.hybrid_engine, self.actor_optimizer, \ + self.actor_model_config, self.actor_optim_config = self._build_model_optimizer( + model_path=self.config.model.path, + megatron_config=megatron_config, + optim_config=optim_config, + override_model_config=override_model_config, + ) + + if self._is_actor: + self.actor = MegatronPPOActor(config=self.config.actor, + model_config=self.actor_model_config, + megatron_config=megatron_config, + actor_module=self.actor_module, + actor_optimizer=self.actor_optimizer, + actor_optimizer_config=self.actor_optim_config) + + if self._is_rollout: + self.rollout, self.sharding_manager = self._build_rollout() + + if self._is_ref: + 
self.ref_module, self.ref_model_config = self._build_model_optimizer( + model_path=self.config.model.path, + megatron_config=megatron_config, + optim_config=None, + override_model_config=override_model_config, + ) + self.ref_policy = MegatronPPOActor(config=self.config.ref, + model_config=self.ref_model_config, + megatron_config=megatron_config, + actor_module=self.ref_module, + actor_optimizer=None, + actor_optimizer_config=None) + + torch.cuda.empty_cache() + + @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO) + def update_actor(self, data: DataProto): + assert self._is_actor + + data.batch = data.batch.cuda() + + log_gpu_memory_usage('Before update policy', logger=logger) + + dataloader = self.actor.make_minibatch_iterator(data=data) + metrics = self.actor.update_policy(dataloader=dataloader) + + log_gpu_memory_usage('After update policy', logger=logger) + + # TODO: here, we should return all metrics + output = DataProto(meta_info={'metrics': metrics}) + output = output.to('cpu') + torch.cuda.empty_cache() + return output + + # @register(dispatch_mode=Dispatch.MEGATRON_PP_AS_DP_PROTO) + # def compute_log_prob(self, data: DataProto) -> DataProto: + # assert self._is_rollout + # output = self.actor.compute_log_prob(data=data) + # output = DataProto.from_dict(tensors={'old_log_probs': output}) + # torch.cuda.empty_cache() + # return output + + @register(dispatch_mode=Dispatch.MEGATRON_PP_AS_DP_PROTO) + def generate_sequences(self, prompts: DataProto): + assert self._is_rollout + + prompts.batch = prompts.batch.cuda() + meta_info = {'eos_token_id': self.tokenizer.eos_token_id, 'pad_token_id': self.tokenizer.pad_token_id} + prompts.meta_info.update(meta_info) + with self.sharding_manager: + log_gpu_memory_usage('After entering sharding manager', logger=logger) + + prompts = self.sharding_manager.preprocess_data(prompts) + output = self.rollout.generate_sequences(prompts=prompts) + + log_gpu_memory_usage('After rollout generation', logger=logger) + + output = self.sharding_manager.postprocess_data(output) + + validate = prompts.meta_info.get('validate', False) + if self._is_actor and not validate: + # we should always recompute old_log_probs when it is HybridEngine + output.meta_info['micro_batch_size'] = self.config.rollout.log_prob_micro_batch_size + output.meta_info['temperature'] = self.config.rollout.temperature + old_log_probs = self.actor.compute_log_prob(data=output) + output.batch['old_log_probs'] = old_log_probs + + output = output.to('cpu') + # clear kv cache + torch.cuda.empty_cache() + log_gpu_memory_usage('After recompute log prob', logger=logger) + return output + + @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO) + def compute_ref_log_prob(self, data: DataProto): + data = data.to('cuda') + + assert self._is_ref + if self._is_offload_param: + load_megatron_param_and_grad(self.ref_module, torch.cuda.current_device(), self._is_offload_grad) + + micro_batch_size = self.config.rollout.log_prob_micro_batch_size + data.meta_info['micro_batch_size'] = micro_batch_size + data.meta_info['temperature'] = self.config.rollout.temperature + output = self.ref_policy.compute_log_prob(data=data) + output = DataProto.from_dict(tensors={'ref_log_prob': output}) + output = output.to('cpu') + if self._is_offload_param: + offload_megatron_param_and_grad(self.ref_module, self._is_offload_grad) + torch.cuda.empty_cache() + return output + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def load_checkpoint(self, checkpoint_path): + pass + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) 
+ def load_pretrained_model(self, checkpoint_path): + pass + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def save_checkpoint(self, checkpoint_path): + assert self._is_actor + pass + + +class CriticWorker(MegatronWorker): + + def __init__(self, config): + super().__init__() + self.config = config + + # NOTE(sgm): We utilize colocate WorkerGroup by default. + # As a result, Workers for different model share the same process. + # Therefore, we only require one distribute initialization. + # To utilize different parallel startegy in different models: + # 1, users should disable WorkerDict; 2.assign different ResourcePool to different models, + # 3. and apply the following patch in ray==2.10, https://github.com/ray-project/ray/pull/44385 + if not torch.distributed.is_initialized(): + rank = int(os.environ['LOCAL_RANK']) + torch.distributed.init_process_group(backend="nccl") + torch.cuda.set_device(rank) + + if self.config.megatron.sequence_parallel: + os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1' + mpu.initialize_model_parallel( + tensor_model_parallel_size=self.config.megatron.tensor_model_parallel_size, + pipeline_model_parallel_size=self.config.megatron.pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, + use_sharp=False, + context_parallel_size=1, + expert_model_parallel_size=1, + nccl_communicator_config_path=None, + ) + + set_random_seed(seed=self.config.megatron.seed) + + # normalize config + self.config.ppo_mini_batch_size //= mpu.get_data_parallel_world_size() + self.config.ppo_micro_batch_size //= mpu.get_data_parallel_world_size() + + # TODO(sgm): support critic model offload + + def _build_critic_model_optimizer(self, + model_path, + megatron_config: ModelParallelConfig, + optim_config, + override_model_config, + enable_gradient_checkpointing=False): + from megatron.core.models.gpt.gpt_model import ModelType + from verl.utils.model import print_model_size, update_model_config + from verl.utils.megatron.optimizer import get_megatron_optimizer + from verl.utils.megatron_utils import get_model, init_megatron_optim_config, init_model_parallel_config + from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig + + # Step 1: initialize the tokenizer + local_path = copy_local_path_from_hdfs(model_path) + self.tokenizer = hf_tokenizer(local_path) + + # Step 2: get the actor_model_config + critic_model_config = AutoConfig.from_pretrained(local_path) + + override_config_kwargs = { + 'bos_token_id': self.tokenizer.bos_token_id, + 'eos_token_id': self.tokenizer.eos_token_id, + 'pad_token_id': self.tokenizer.pad_token_id, + } + override_config_kwargs.update(override_model_config) + update_model_config(critic_model_config, override_config_kwargs=override_config_kwargs) + + if self.rank == 0: + print(f'Model config after override: {critic_model_config}') + + def megatron_critic_model_provider(pre_process, post_process): + from verl.utils.model import get_parallel_model_from_config + # TODO: support vpp here + # vpp_rank = mpu.get_virtual_pipeline_model_parallel_rank() # this will be set inside get_model + # this_megatron_config = copy.deepcopy(megatron_config) + # this_megatron_config.virtual_pipeline_model_parallel_rank = vpp_rank + parallel_model = get_parallel_model_from_config(config=critic_model_config, + megatron_config=megatron_config, + pre_process=pre_process, + post_process=post_process, + value=True) + parallel_model.cuda() + return parallel_model + + # Step 3: initialize the megatron model + 
critic_module = get_model(model_provider_func=megatron_critic_model_provider, + model_type=ModelType.encoder_or_decoder, + wrap_with_ddp=True) + # note that here critic_module will be a list to be compatible with the construction of interleaved pp (vpp). + # but here, we do not use pp (vpp) yet. For simplicity, we remove the list + # critic_module = nn.ModuleList(critic_module) + + if self.config.load_weight: + load_megatron_model_weights(self.config, + critic_model_config, + critic_module, + params_dtype=megatron_config.params_dtype, + is_value_model=True) + if self.rank == 0: + print_model_size(critic_module[0]) + + # TODO: add more optimizer args into config + optim_config = init_megatron_optim_config(optim_config) + critic_optimizer = get_megatron_optimizer(model=critic_module, config=optim_config) + torch.cuda.empty_cache() + return critic_module, critic_optimizer, critic_model_config, optim_config + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def init_model(self): + # create critic + from omegaconf import OmegaConf + from verl.utils.torch_dtypes import PrecisionType + + if self.config.model.get('external_lib', None) is not None: + # This is used to import external_lib into the huggingface systems + import importlib + importlib.import_module(self.config.model.external_lib) + override_model_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create())) + torch_dtype = torch.bfloat16 + + megatron_config = OmegaConf.create({ + 'sequence_parallel': self.config.megatron.get('sequence_parallel', True), + 'param_dtype': PrecisionType.to_str(torch_dtype), + 'tensor_model_parallel_size': mpu.get_tensor_model_parallel_world_size(), + 'pipeline_model_parallel_rank': mpu.get_pipeline_model_parallel_rank(), + 'pipeline_model_parallel_size': mpu.get_pipeline_model_parallel_world_size(), + 'virtual_pipeline_model_parallel_rank': mpu.get_virtual_pipeline_model_parallel_rank(), + 'virtual_pipeline_model_parallel_size': mpu.get_virtual_pipeline_model_parallel_world_size() + }) + + megatron_config = init_model_parallel_config(megatron_config) + + critic_module, critic_optimizer, critic_model_config, critic_optimizer_config = self._build_critic_model_optimizer( + model_path=self.config.model.path, + megatron_config=megatron_config, + optim_config=self.config.optim, + override_model_config=override_model_config) + self.critic = MegatronPPOCritic(config=self.config, + model_config=critic_model_config, + megatron_config=megatron_config, + critic_module=critic_module, + critic_optimizer=critic_optimizer, + critic_optimizer_config=critic_optimizer_config) + + @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO) + def compute_values(self, data: DataProto): + data = data.to('cuda') + values = self.critic.compute_values(data=data) + output = DataProto.from_dict(tensors={'values': values}) + output = output.to('cpu') + return output + + @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO) + def update_critic(self, data: DataProto): + data = data.to('cuda') + dataloader = self.critic.make_minibatch_iterator(data) + metrics = self.critic.update_critic(dataloader=dataloader) + output = DataProto(batch=None, meta_info={'metrics': metrics}) + output = output.to('cpu') + return output + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def load_checkpoint(self, checkpoint_path): + pass + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def save_checkpoint(self, checkpoint_path): + pass + + +class RewardModelWorker(MegatronWorker): + """ + Note that we only implement the reward 
model that is subclass of AutoModelForSequenceClassification. + """ + + def __init__(self, config): + super().__init__() + self.config = config + + # NOTE(sgm): We utilize colocate WorkerGroup by default. + # As a result, Workers for different model share the same process. + # Therefore, we only require one distribute initialization. + # To utilize different parallel startegy in different models: + # 1, users should disable WorkerDict; 2.assign different ResourcePool to different models, + # 3. and apply the following patch in ray==2.10, https://github.com/ray-project/ray/pull/44385 + if not torch.distributed.is_initialized(): + rank = int(os.environ['LOCAL_RANK']) + torch.distributed.init_process_group(backend="nccl") + torch.cuda.set_device(rank) + + if self.config.megatron.sequence_parallel: + os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1' + mpu.initialize_model_parallel( + tensor_model_parallel_size=self.config.megatron.tensor_model_parallel_size, + pipeline_model_parallel_size=self.config.megatron.pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, + use_sharp=False, + context_parallel_size=1, + expert_model_parallel_size=1, + nccl_communicator_config_path=None, + ) + + set_random_seed(seed=self.config.megatron.seed) + + # normalize config + self.config.micro_batch_size //= mpu.get_data_parallel_world_size() + + def _build_rm_model(self, model_path, megatron_config: ModelParallelConfig, override_model_config): + from megatron.core.models.gpt.gpt_model import ModelType + from verl.utils.model import print_model_size, update_model_config + from verl.utils.megatron_utils import get_model + from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig + + # Step 1: initialize the tokenizer + local_path = copy_local_path_from_hdfs(model_path) + self.tokenizer = hf_tokenizer(local_path) + + # Step 2: get the actor_model_config + rm_model_config = AutoConfig.from_pretrained(local_path) + + override_config_kwargs = { + 'bos_token_id': self.tokenizer.bos_token_id, + 'eos_token_id': self.tokenizer.eos_token_id, + 'pad_token_id': self.tokenizer.pad_token_id, + } + override_config_kwargs.update(override_model_config) + update_model_config(rm_model_config, override_config_kwargs=override_config_kwargs) + + if self.rank == 0: + print(f'Model config after override: {rm_model_config}') + + def megatron_rm_model_provider(pre_process, post_process): + from verl.utils.model import get_parallel_model_from_config + # vpp is not supported yet because it will hang for some reason. Need debugging + vpp_rank = mpu.get_virtual_pipeline_model_parallel_rank() # this will be set inside get_model + # this_megatron_config = copy.deepcopy(megatron_config) + # this_megatron_config.virtual_pipeline_model_parallel_rank = vpp_rank + parallel_model = get_parallel_model_from_config(config=rm_model_config, + megatron_config=megatron_config, + pre_process=pre_process, + post_process=post_process, + value=True) + parallel_model.cuda() + return parallel_model + + # Step 3: initialize the megatron model + reward_model = get_model(model_provider_func=megatron_rm_model_provider, + model_type=ModelType.encoder_or_decoder, + wrap_with_ddp=False) + # note that here critic_module will be a list to be compatible with the construction of interleaved pp (vpp). + # but here, we do not use pp (vpp) yet. 
For simplicity, we remove the list + # reward_model = nn.ModuleList(reward_model) + + if self.config.load_weight: + load_megatron_model_weights(self.config, + rm_model_config, + reward_model, + params_dtype=megatron_config.params_dtype, + is_value_model=True) + + # TODO: add more optimizer args into config + torch.cuda.empty_cache() + return reward_model, rm_model_config + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def init_model(self): + # create critic + from omegaconf import OmegaConf + from verl.utils.torch_dtypes import PrecisionType + from transformers import AutoTokenizer + + if self.config.model.get('external_lib', None) is not None: + # This is used to import external_lib into the huggingface systems + import importlib + importlib.import_module(self.config.model.external_lib) + override_model_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create())) + + sft_tokenizer_local_path = copy_local_path_from_hdfs(self.config.model.input_tokenizer) + sft_tokenizer = hf_tokenizer(sft_tokenizer_local_path) + rm_tokenizer_path = self.config.model.get('rm_tokenizer', None) + rm_tokenizer = None + if rm_tokenizer_path is not None: + rm_tokenizer_local_path = copy_local_path_from_hdfs(rm_tokenizer_path) + rm_tokenizer = hf_tokenizer(rm_tokenizer_local_path) + + torch_dtype = torch.bfloat16 + + megatron_config = OmegaConf.create({ + 'sequence_parallel': self.config.megatron.get('sequence_parallel', True), + 'param_dtype': PrecisionType.to_str(torch_dtype), + 'tensor_model_parallel_size': mpu.get_tensor_model_parallel_world_size(), + 'pipeline_model_parallel_rank': mpu.get_pipeline_model_parallel_rank(), + 'pipeline_model_parallel_size': mpu.get_pipeline_model_parallel_world_size(), + 'virtual_pipeline_model_parallel_rank': mpu.get_virtual_pipeline_model_parallel_rank(), + 'virtual_pipeline_model_parallel_size': mpu.get_virtual_pipeline_model_parallel_world_size() + }) + + megatron_config = init_model_parallel_config(megatron_config) + + reward_model_module, reward_model_config = self._build_rm_model( + model_path=self.config.model.path, + megatron_config=megatron_config, + override_model_config=override_model_config, + ) + # FIXME(sgm): reward model param offload is implemented in MegatronRewardModel + # should be implemented in workers + self.rm = MegatronRewardModel(config=self.config, + reward_model_module=reward_model_module, + model_config=reward_model_config, + megatron_config=megatron_config, + sft_tokenizer=sft_tokenizer, + rm_tokenizer=rm_tokenizer) + + # TODO: reward model use itself tokenizer instead of sft tokenizer + # the input_ids, responses, attention_mask and position_ids may be different! + @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO) + def compute_rm_score(self, data: DataProto): + data.batch = data.batch.cuda() + output = self.rm.compute_reward(data) + output = output.to('cpu') + return output diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/files/output.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..af209dadde9cd85183855f02f579d64ec3a6e363 --- /dev/null +++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/files/output.log @@ -0,0 +1,23 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::WorkerDict.ref_init_model() (pid=1552808, ip=172.16.34.29, actor_id=e65619ea51238e1c4c82195501000000, repr=) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/single_controller/ray/base.py", line 399, in func + return getattr(self.worker_dict[key], name)(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/single_controller/base/decorator.py", line 404, in inner + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py", line 286, in init_model + from verl.workers.actor import DataParallelPPOActor + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/workers/actor/__init__.py", line 16, in + from .dp_actor import DataParallelPPOActor + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/workers/actor/dp_actor.py", line 34, in + from flash_attn.bert_padding import pad_input, unpad_input, rearrange, index_first_axis + File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/flash_attn/__init__.py", line 3, in + from flash_attn.flash_attn_interface import ( + File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/flash_attn/flash_attn_interface.py", line 15, in + import flash_attn_2_cuda as flash_attn_gpu +ImportError: /home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/flash_attn_2_cuda.cpython-312-x86_64-linux-gnu.so: undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/files/requirements.txt b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..5300dcb7d76408546b372027fa1fbbd53b54e600 --- /dev/null +++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/files/requirements.txt @@ -0,0 +1,288 @@ +verl==0.1 +psutil==7.1.3 +colorama==0.4.6 +annotated-doc==0.0.4 +pyasn1==0.6.1 +virtualenv==20.35.4 +requests==2.32.5 +nvidia-cufft-cu12==11.0.2.54 +nvidia-cufile-cu12==1.13.1.3 +verl==0.1 +ml_dtypes==0.5.4 +opentelemetry-sdk==1.39.0 +sglang==0.5.2 +xformers==0.0.27.post2 +lm-format-enforcer==0.10.6 +typing_extensions==4.15.0 +nvidia-cusparselt-cu12==0.7.1 +openai-harmony==0.0.4 +transformers==4.56.1 +pytest==9.0.2 +psutil==7.1.3 +cupy-cuda12x==13.6.0 +tqdm==4.67.1 +onnx==1.20.0 +pybind11==3.0.1 +partial-json-parser==0.2.1.1.post7 +nvidia-nccl-cu12==2.20.5 +aiohttp-cors==0.8.1 +sniffio==1.3.1 +tensordict==0.10.0 +smart_open==7.5.0 +cffi==2.0.0 +asttokens==3.0.1 +opencensus==0.11.4 +rpds-py==0.30.0 +py-spy==0.4.1 +nvidia-nvjitlink-cu12==12.8.93 +httpx==0.28.1 +cuda-python==13.1.1 +annotated-types==0.7.0 +idna==3.11 +fsspec==2025.10.0 +parso==0.8.5 +torchvision==0.19.0 +MarkupSafe==3.0.3 +opentelemetry-api==1.39.0 +pytz==2025.2 +dnspython==2.8.0 +zipp==3.23.0 +PyYAML==6.0.3 +onnx-ir==0.1.12 +torchdata==0.11.0 +Markdown==3.10 +urllib3==2.6.1 +cuda-pathfinder==1.3.3 +nvidia-cuda-cupti-cu12==12.1.105 +httptools==0.7.1 +pyarrow==22.0.0 +opentelemetry-proto==1.39.0 +certifi==2025.11.12 +typer==0.20.0 +python-json-logger==4.0.0 
+pillow==12.0.0 +cuda-bindings==13.1.1 +Werkzeug==3.1.4 +mdurl==0.1.2 +vllm==0.6.3 +referencing==0.37.0 +xxhash==3.6.0 +interegular==0.3.3 +build==1.3.0 +fastapi-cli==0.0.16 +tensorboard==2.20.0 +sentencepiece==0.2.1 +yarl==1.22.0 +opencv-fixer==0.2.5 +python-dotenv==1.2.1 +timm==1.0.16 +aiohappyeyeballs==2.6.1 +decord==0.6.0 +nvidia-cusolver-cu12==11.4.5.107 +jiter==0.12.0 +airportsdata==20250909 +nvidia-nvtx-cu12==12.1.105 +markdown-it-py==4.0.0 +torch==2.4.0 +thefuzz==0.22.1 +opencv-python-headless==4.11.0.86 +pycryptodomex==3.23.0 +pexpect==4.9.0 +distro==1.9.0 +cloudpickle==3.1.2 +mpmath==1.3.0 +antlr4-python3-runtime==4.9.3 +peft==0.18.0 +tzdata==2025.2 +accelerate==1.12.0 +watchfiles==1.1.1 +omegaconf==2.3.0 +multiprocess==0.70.18 +frozendict==2.4.7 +sympy==1.14.0 +setproctitle==1.3.7 +setuptools==79.0.1 +py-cpuinfo==9.0.0 +ipython_pygments_lexers==1.1.1 +openai==1.99.1 +outlines_core==0.2.11 +google-api-core==2.28.1 +llvmlite==0.44.0 +attrs==25.4.0 +packaging==25.0 +fastrlock==0.8.3 +astor==0.8.1 +gguf==0.10.0 +opencv-python==4.12.0.88 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cudnn-frontend==1.16.0 +pluggy==1.6.0 +compressed-tensors==0.11.0 +importlib_metadata==8.7.0 +RapidFuzz==3.14.3 +networkx==3.6.1 +httpcore==1.0.9 +pre_commit==4.5.0 +python-multipart==0.0.20 +rich==14.2.0 +onnxscript==0.3.1 +cbor2==5.7.1 +smmap==5.0.2 +numpy==1.26.4 +opentelemetry-exporter-prometheus==0.60b0 +click==8.2.1 +traitlets==5.14.3 +nvidia-curand-cu12==10.3.2.106 +pyvers==0.1.0 +huggingface-hub==0.36.0 +cfgv==3.5.0 +optree==0.18.0 +anthropic==0.75.0 +email-validator==2.3.0 +tabulate==0.9.0 +msgpack==1.1.2 +depyf==0.19.0 +numba==0.61.2 +six==1.17.0 +einops==0.8.1 +aiosignal==1.4.0 +propcache==0.4.1 +torch_memory_saver==0.0.8 +h11==0.16.0 +frozenlist==1.8.0 +pycountry==24.6.1 +modelscope==1.33.0 +sentry-sdk==2.47.0 +av==16.0.1 +stack-data==0.6.3 +typing-inspection==0.4.2 +googleapis-common-protos==1.72.0 +blake3==1.0.8 +nvidia-cudnn-cu12==9.1.0.70 +liger_kernel==0.6.4 +wrapt==2.0.1 +prompt_toolkit==3.0.52 +torchaudio==2.8.0 +identify==2.6.15 +mistral_common==1.8.6 +codetiming==1.4.0 +nodeenv==1.9.1 +platformdirs==4.5.1 +jsonschema-specifications==2025.9.1 +protobuf==6.33.2 +hydra-core==1.3.2 +absl-py==2.3.1 +tensorboard-data-server==0.7.2 +jsonschema==4.25.1 +pyasn1_modules==0.4.2 +tiktoken==0.12.0 +starlette==0.50.0 +pyproject_hooks==1.2.0 +flash_attn==2.8.1 +fastapi==0.124.2 +rsa==4.9.1 +nest-asyncio==1.6.0 +lark==1.2.2 +fastar==0.8.0 +datasets==4.4.1 +prometheus-fastapi-instrumentator==7.1.0 +nvidia-cusparse-cu12==12.1.0.106 +ruff==0.14.8 +mathruler==0.1.0 +pydantic_core==2.41.5 +pyairports==0.0.1 +ipython==9.8.0 +pynvml==13.0.1 +nvidia-cuda-nvrtc-cu12==12.1.105 +filelock==3.20.0 +loguru==0.7.3 +pandas==2.3.3 +msgspec==0.20.0 +uvicorn==0.38.0 +blobfile==3.0.0 +gitdb==4.0.12 +cachetools==6.2.2 +uv==0.9.17 +llguidance==0.7.30 +hf_transfer==0.1.9 +wcwidth==0.2.14 +aiohttp==3.13.2 +qwen-vl-utils==0.0.14 +rich-toolkit==0.17.0 +ptyprocess==0.7.0 +ipdb==0.13.13 +opencensus-context==0.1.3 +jedi==0.19.2 +soxr==1.0.0 +ray==2.52.1 +sgl-kernel==0.3.9.post2 +colorful==0.5.8 +pycparser==2.23 +charset-normalizer==3.4.4 +hf-xet==1.2.0 +dill==0.4.0 +tokenizers==0.22.1 +prometheus_client==0.23.1 +google-auth==2.43.0 +pydantic==2.12.5 +nvidia-ml-py==13.590.44 +fastapi-cloud-cli==0.6.0 +flashinfer-python==0.3.1 +orjson==3.11.5 +python-dateutil==2.9.0.post0 +GitPython==3.1.45 +triton==3.0.0 +torchao==0.9.0 +soundfile==0.13.1 +diskcache==5.6.3 +docstring_parser==0.17.0 +anyio==4.12.0 +matplotlib-inline==0.2.1 
+Pygments==2.19.2 +pure_eval==0.2.3 +ninja==1.13.0 +outlines==0.0.46 +wandb==0.23.1 +regex==2025.11.3 +pyzmq==27.1.0 +iniconfig==2.3.0 +Jinja2==3.1.6 +wheel==0.45.1 +megatron-core==0.13.1 +multidict==6.7.0 +uvloop==0.22.1 +proto-plus==1.26.1 +pylatexenc==2.10 +decorator==5.2.1 +websockets==15.0.1 +shellingham==1.5.4 +lxml==6.0.2 +safetensors==0.7.0 +scipy==1.16.3 +xgrammar==0.1.25 +pybase64==1.4.3 +opentelemetry-semantic-conventions==0.60b0 +pydantic-extra-types==2.10.6 +rignore==0.7.6 +nvidia-cuda-runtime-cu12==12.1.105 +distlib==0.4.0 +executing==2.2.1 +grpcio==1.76.0 +pip==25.3 +verl==0.1 +autocommand==2.2.2 +typeguard==4.3.0 +more-itertools==10.3.0 +importlib_metadata==8.0.0 +jaraco.functools==4.0.1 +typing_extensions==4.12.2 +jaraco.text==3.12.1 +platformdirs==4.2.2 +jaraco.collections==5.1.0 +jaraco.context==5.3.0 +inflect==7.3.1 +tomli==2.0.1 +zipp==3.19.2 +backports.tarfile==1.2.0 +wheel==0.45.1 +packaging==24.2 diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/files/wandb-metadata.json b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..d5a3ad86d925ba3615248e77d3094bf18265b6dc --- /dev/null +++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/files/wandb-metadata.json @@ -0,0 +1,93 @@ +{ + "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.12", + "startedAt": "2026-02-02T01:21:50.845281Z", + "args": [ + "--node-ip-address=172.16.34.29", + "--node-manager-port=44253", + "--object-store-name=/tmp/ray/session_2026-02-01_20-21-00_836157_1537223/sockets/plasma_store", + "--raylet-name=/tmp/ray/session_2026-02-01_20-21-00_836157_1537223/sockets/raylet", + "--redis-address=None", + "--metrics-agent-port=62109", + "--logging-rotate-bytes=536870912", + "--logging-rotate-backup-count=5", + "--runtime-env-agent-port=52775", + "--gcs-address=172.16.34.29:61367", + "--session-name=session_2026-02-01_20-21-00_836157_1537223", + "--temp-dir=/tmp/ray", + "--webui=127.0.0.1:8301", + "--cluster-id=2549a1b366f4964ef8f23a40ed67bd8fc06a14a97f8f53cf9a7706d5", + "--startup-token=128", + "--worker-launch-time-ms=1769995275494", + "--node-id=abe8c189092abd663e817fc41a28c5b237c1d0ba9d4dd79fd858d0fd", + "--runtime-env-hash=1830736042" + ], + "program": "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/ray/_private/workers/default_worker.py", + "git": { + "remote": "https://github.com/PeterGriffinJin/Search-R1", + "commit": "598e61bd1d36895726d28a8d06b3a15bed19f5d3" + }, + "email": "shahidulshakib034@gmail.com", + "root": "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1", + "host": "gamma", + "executable": "/home/mshahidul/miniconda3/envs/verl/bin/python3", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA A100 80GB PCIe", + "gpu_count": 6, + "disk": { + "/": { + "total": "3766429188096", + "used": "184569806848" + } + }, + "memory": { + "total": "1081814863872" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": 
"GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece" + } + ], + "cudaVersion": "13.0", + "writerId": "5m4a15ougycb78o8erwj3yecrb7tbwn1" +} \ No newline at end of file diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug-core.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..f49edc5ee7edb53bbb72a5b046b7880bac1fac64 --- /dev/null +++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2026-02-01T20:21:51.061260937-05:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpai5lb1ap/port-1545881.txt","pid":1545881,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-01T20:21:51.062200689-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":1545881} +{"time":"2026-02-01T20:21:51.062207345-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-1545881-1551236-3300304792/socket","Net":"unix"}} +{"time":"2026-02-01T20:21:51.22693701-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-01T20:21:51.242673041-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"t68srmo7","id":"1(@)"} +{"time":"2026-02-01T20:21:52.263251526-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"t68srmo7","id":"1(@)"} +{"time":"2026-02-01T20:22:38.856926907-05:00","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug-internal.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..6eb28f83e680c11c42634386c2768b87bd56582b --- /dev/null +++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2026-02-01T20:21:51.244466362-05:00","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2026-02-01T20:21:52.25966277-05:00","level":"INFO","msg":"stream: created new stream","id":"t68srmo7"} +{"time":"2026-02-01T20:21:52.259825063-05:00","level":"INFO","msg":"handler: started","stream_id":"t68srmo7"} +{"time":"2026-02-01T20:21:52.26322258-05:00","level":"INFO","msg":"stream: started","id":"t68srmo7"} +{"time":"2026-02-01T20:21:52.263274117-05:00","level":"INFO","msg":"writer: started","stream_id":"t68srmo7"} +{"time":"2026-02-01T20:21:52.263304682-05:00","level":"INFO","msg":"sender: started","stream_id":"t68srmo7"} diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug.log new file mode 100644 index 
0000000000000000000000000000000000000000..b14f79f089787cd6d77dc9ca89080e44131ed562 --- /dev/null +++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug.log @@ -0,0 +1,21 @@ +2026-02-01 20:21:50,865 INFO MainThread:1545881 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2026-02-01 20:21:50,866 INFO MainThread:1545881 [wandb_setup.py:_flush():80] Configure stats pid to 1545881 +2026-02-01 20:21:50,866 INFO MainThread:1545881 [wandb_setup.py:_flush():80] Loading settings from /home/mshahidul/.config/wandb/settings +2026-02-01 20:21:50,866 INFO MainThread:1545881 [wandb_setup.py:_flush():80] Loading settings from /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/settings +2026-02-01 20:21:50,866 INFO MainThread:1545881 [wandb_setup.py:_flush():80] Loading settings from environment variables +2026-02-01 20:21:50,867 INFO MainThread:1545881 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug.log +2026-02-01 20:21:50,867 INFO MainThread:1545881 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/logs/debug-internal.log +2026-02-01 20:21:50,868 INFO MainThread:1545881 [wandb_init.py:init():841] calling init triggers +2026-02-01 20:21:50,868 INFO MainThread:1545881 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'data': {'tokenizer': None, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet', 'train_data_num': None, 'val_data_num': None, 'prompt_key': 'prompt', 'max_prompt_length': 4096, 'max_response_length': 1024, 'max_start_length': 256, 'max_obs_length': 512, 'train_batch_size': 128, 'val_batch_size': 64, 'return_raw_input_ids': False, 'return_raw_chat': False, 'shuffle_train_dataloader': True}, 'actor_rollout_ref': {'hybrid_engine': True, 'model': {'path': 'Qwen/Qwen3-4B-Instruct-2507', 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'use_remove_padding': True}, 'actor': {'strategy': 'fsdp', 'ppo_mini_batch_size': 64, 'ppo_micro_batch_size': 64, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'grad_clip': 1.0, 'state_masking': False, 'clip_ratio': 0.2, 'entropy_coeff': 0.001, 'use_kl_loss': False, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'ulysses_sequence_parallel_size': 1, 'optim': {'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'min_lr_ratio': None, 'warmup_style': 'constant', 'total_training_steps': 1005}, 'fsdp_config': {'wrap_policy': {'min_num_params': 0}, 'param_offload': True, 'grad_offload': False, 'optimizer_offload': True, 'fsdp_size': -1}, 'ppo_micro_batch_size_per_gpu': 16}, 'ref': {'fsdp_config': {'param_offload': True, 'wrap_policy': {'min_num_params': 0}, 'fsdp_size': -1}, 'log_prob_micro_batch_size': 64, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'ulysses_sequence_parallel_size': 1}, 'rollout': {'name': 'vllm', 'temperature': 1.0, 'top_k': -1, 'top_p': 0.95, 'prompt_length': 4096, 'response_length': 1024, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.4, 'ignore_eos': False, 'enforce_eager': True, 'free_cache_engine': True, 'load_format': 'dummy_dtensor', 'tensor_model_parallel_size': 1, 
'max_num_batched_tokens': 8192, 'max_num_seqs': 1024, 'log_prob_micro_batch_size': 64, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'do_sample': True, 'n': 1, 'n_agent': 1}}, 'critic': {'strategy': 'fsdp', 'optim': {'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'min_lr_ratio': None, 'warmup_style': 'constant', 'total_training_steps': 1005}, 'model': {'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'enable_gradient_checkpointing': False, 'use_remove_padding': False, 'fsdp_config': {'param_offload': False, 'grad_offload': False, 'optimizer_offload': False, 'wrap_policy': {'min_num_params': 0}, 'fsdp_size': -1}}, 'ppo_mini_batch_size': 64, 'ppo_micro_batch_size': 64, 'forward_micro_batch_size': 64, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ulysses_sequence_parallel_size': 1, 'ppo_epochs': 1, 'shuffle': False, 'grad_clip': 1.0, 'cliprange_value': 0.5}, 'reward_model': {'enable': False, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'use_remove_padding': False, 'fsdp_config': {'min_num_params': 0, 'param_offload': False}}, 'micro_batch_size': 64, 'max_length': None, 'ulysses_sequence_parallel_size': 1, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'structure_format_score': 0, 'final_format_score': 0, 'retrieval_score': 0}, 'retriever': {'url': 'http://127.0.0.1:8000/retrieve', 'topk': 3}, 'algorithm': {'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'no_think_rl': False, 'kl_penalty': 'kl', 'kl_ctrl': {'type': 'fixed', 'kl_coef': 0.001}, 'state_masking': {'start_state_marker': '', 'end_state_marker': ''}}, 'trainer': {'total_epochs': 15, 'total_training_steps': 1005, 'project_name': '', 'experiment_name': 'llm_guard_3B_10k_v2', 'logger': ['wandb'], 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 100, 'test_freq': 50, 'critic_warmup': 0, 'default_hdfs_dir': '~/experiments/gsm8k/ppo/llm_guard_3B_10k_v2', 'default_local_dir': 'verl_checkpoints/llm_guard_3B_10k_v2'}, 'max_turns': 1, 'do_search': False, '_wandb': {}} +2026-02-01 20:21:50,868 INFO MainThread:1545881 [wandb_init.py:init():889] starting backend +2026-02-01 20:21:51,227 INFO MainThread:1545881 [wandb_init.py:init():892] sending inform_init request +2026-02-01 20:21:51,235 INFO MainThread:1545881 [wandb_init.py:init():900] backend started and connected +2026-02-01 20:21:51,244 INFO MainThread:1545881 [wandb_init.py:init():970] updated telemetry +2026-02-01 20:21:51,270 INFO MainThread:1545881 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2026-02-01 20:21:52,622 INFO MainThread:1545881 [wandb_init.py:init():1041] starting run threads in backend +2026-02-01 20:21:53,469 INFO MainThread:1545881 [wandb_run.py:_console_start():2521] atexit reg +2026-02-01 20:21:53,469 INFO MainThread:1545881 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2026-02-01 20:21:53,469 INFO MainThread:1545881 [wandb_run.py:_redirect():2438] Wrapping output streams. +2026-02-01 20:21:53,470 INFO MainThread:1545881 [wandb_run.py:_redirect():2461] Redirects installed. 
+2026-02-01 20:21:53,481 INFO MainThread:1545881 [wandb_init.py:init():1081] run started, returning control to user process diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/run-t68srmo7.wandb b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/run-t68srmo7.wandb new file mode 100644 index 0000000000000000000000000000000000000000..30977402b08eb894463b83d7de7a007255879a32 Binary files /dev/null and b/code/RL_model/verl/Search-R1/wandb/run-20260201_202150-t68srmo7/run-t68srmo7.wandb differ diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/files/output.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..224e7b99f95aed5c56df55343d05d4b6dfe0021f --- /dev/null +++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/files/output.log @@ -0,0 +1,20 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::WorkerDict.ref_init_model() (pid=1569209, ip=172.16.34.29, actor_id=30bbe065b8ff586d669f4f9101000000, repr=) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/single_controller/ray/base.py", line 399, in func + return getattr(self.worker_dict[key], name)(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/single_controller/base/decorator.py", line 404, in inner + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py", line 335, in init_model + self.ref_module_fsdp = self._build_model_optimizer(model_path=self.config.model.path, + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py", line 143, in _build_model_optimizer + check_model_support_rmpad(actor_model_config.model_type) + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/models/registry.py", line 30, in check_model_support_rmpad + raise ValueError(f"Model architecture {model_type} is not supported for now. " +ValueError: Model architecture qwen3 is not supported for now. RMPad supported architectures: dict_keys(['llama', 'mistral', 'gemma', 'qwen2']).Please set `use_remove_padding=False` in the model config. 
diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/files/requirements.txt b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..2dd689031142d7d6d129a52ef2a92f2070eb5c06 --- /dev/null +++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/files/requirements.txt @@ -0,0 +1,288 @@ +verl==0.1 +psutil==7.1.3 +colorama==0.4.6 +annotated-doc==0.0.4 +pyasn1==0.6.1 +virtualenv==20.35.4 +requests==2.32.5 +nvidia-cufft-cu12==11.0.2.54 +nvidia-cufile-cu12==1.13.1.3 +verl==0.1 +ml_dtypes==0.5.4 +opentelemetry-sdk==1.39.0 +sglang==0.5.2 +xformers==0.0.27.post2 +lm-format-enforcer==0.10.6 +typing_extensions==4.15.0 +nvidia-cusparselt-cu12==0.7.1 +openai-harmony==0.0.4 +transformers==4.56.1 +pytest==9.0.2 +psutil==7.1.3 +cupy-cuda12x==13.6.0 +tqdm==4.67.1 +onnx==1.20.0 +pybind11==3.0.1 +partial-json-parser==0.2.1.1.post7 +nvidia-nccl-cu12==2.20.5 +aiohttp-cors==0.8.1 +sniffio==1.3.1 +tensordict==0.10.0 +smart_open==7.5.0 +cffi==2.0.0 +asttokens==3.0.1 +opencensus==0.11.4 +rpds-py==0.30.0 +py-spy==0.4.1 +nvidia-nvjitlink-cu12==12.8.93 +httpx==0.28.1 +cuda-python==13.1.1 +annotated-types==0.7.0 +idna==3.11 +fsspec==2025.10.0 +parso==0.8.5 +torchvision==0.19.0 +MarkupSafe==3.0.3 +opentelemetry-api==1.39.0 +pytz==2025.2 +dnspython==2.8.0 +zipp==3.23.0 +PyYAML==6.0.3 +onnx-ir==0.1.12 +torchdata==0.11.0 +Markdown==3.10 +urllib3==2.6.1 +cuda-pathfinder==1.3.3 +nvidia-cuda-cupti-cu12==12.1.105 +httptools==0.7.1 +pyarrow==22.0.0 +opentelemetry-proto==1.39.0 +certifi==2025.11.12 +typer==0.20.0 +python-json-logger==4.0.0 +pillow==12.0.0 +cuda-bindings==13.1.1 +Werkzeug==3.1.4 +mdurl==0.1.2 +vllm==0.6.3 +referencing==0.37.0 +xxhash==3.6.0 +interegular==0.3.3 +build==1.3.0 +fastapi-cli==0.0.16 +tensorboard==2.20.0 +sentencepiece==0.2.1 +flash_attn==2.8.3 +yarl==1.22.0 +opencv-fixer==0.2.5 +python-dotenv==1.2.1 +timm==1.0.16 +aiohappyeyeballs==2.6.1 +decord==0.6.0 +nvidia-cusolver-cu12==11.4.5.107 +jiter==0.12.0 +airportsdata==20250909 +nvidia-nvtx-cu12==12.1.105 +markdown-it-py==4.0.0 +torch==2.4.0 +thefuzz==0.22.1 +opencv-python-headless==4.11.0.86 +pycryptodomex==3.23.0 +pexpect==4.9.0 +distro==1.9.0 +cloudpickle==3.1.2 +mpmath==1.3.0 +antlr4-python3-runtime==4.9.3 +peft==0.18.0 +tzdata==2025.2 +accelerate==1.12.0 +watchfiles==1.1.1 +omegaconf==2.3.0 +multiprocess==0.70.18 +frozendict==2.4.7 +sympy==1.14.0 +setproctitle==1.3.7 +setuptools==79.0.1 +py-cpuinfo==9.0.0 +ipython_pygments_lexers==1.1.1 +openai==1.99.1 +outlines_core==0.2.11 +google-api-core==2.28.1 +llvmlite==0.44.0 +attrs==25.4.0 +packaging==25.0 +fastrlock==0.8.3 +astor==0.8.1 +gguf==0.10.0 +opencv-python==4.12.0.88 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cudnn-frontend==1.16.0 +pluggy==1.6.0 +compressed-tensors==0.11.0 +importlib_metadata==8.7.0 +RapidFuzz==3.14.3 +networkx==3.6.1 +httpcore==1.0.9 +pre_commit==4.5.0 +python-multipart==0.0.20 +rich==14.2.0 +onnxscript==0.3.1 +cbor2==5.7.1 +smmap==5.0.2 +numpy==1.26.4 +opentelemetry-exporter-prometheus==0.60b0 +click==8.2.1 +traitlets==5.14.3 +nvidia-curand-cu12==10.3.2.106 +pyvers==0.1.0 +huggingface-hub==0.36.0 +cfgv==3.5.0 +optree==0.18.0 +anthropic==0.75.0 +email-validator==2.3.0 +tabulate==0.9.0 +msgpack==1.1.2 +depyf==0.19.0 +numba==0.61.2 +six==1.17.0 +einops==0.8.1 +aiosignal==1.4.0 +propcache==0.4.1 +torch_memory_saver==0.0.8 +h11==0.16.0 +frozenlist==1.8.0 +pycountry==24.6.1 +modelscope==1.33.0 +sentry-sdk==2.47.0 +av==16.0.1 +stack-data==0.6.3 
+typing-inspection==0.4.2 +googleapis-common-protos==1.72.0 +blake3==1.0.8 +nvidia-cudnn-cu12==9.1.0.70 +liger_kernel==0.6.4 +wrapt==2.0.1 +prompt_toolkit==3.0.52 +torchaudio==2.8.0 +identify==2.6.15 +mistral_common==1.8.6 +codetiming==1.4.0 +nodeenv==1.9.1 +platformdirs==4.5.1 +jsonschema-specifications==2025.9.1 +protobuf==6.33.2 +hydra-core==1.3.2 +absl-py==2.3.1 +tensorboard-data-server==0.7.2 +jsonschema==4.25.1 +pyasn1_modules==0.4.2 +tiktoken==0.12.0 +starlette==0.50.0 +pyproject_hooks==1.2.0 +fastapi==0.124.2 +rsa==4.9.1 +nest-asyncio==1.6.0 +lark==1.2.2 +fastar==0.8.0 +datasets==4.4.1 +prometheus-fastapi-instrumentator==7.1.0 +nvidia-cusparse-cu12==12.1.0.106 +ruff==0.14.8 +mathruler==0.1.0 +pydantic_core==2.41.5 +pyairports==0.0.1 +ipython==9.8.0 +pynvml==13.0.1 +nvidia-cuda-nvrtc-cu12==12.1.105 +filelock==3.20.0 +loguru==0.7.3 +pandas==2.3.3 +msgspec==0.20.0 +uvicorn==0.38.0 +blobfile==3.0.0 +gitdb==4.0.12 +cachetools==6.2.2 +uv==0.9.17 +llguidance==0.7.30 +hf_transfer==0.1.9 +wcwidth==0.2.14 +aiohttp==3.13.2 +qwen-vl-utils==0.0.14 +rich-toolkit==0.17.0 +ptyprocess==0.7.0 +ipdb==0.13.13 +opencensus-context==0.1.3 +jedi==0.19.2 +soxr==1.0.0 +ray==2.52.1 +sgl-kernel==0.3.9.post2 +colorful==0.5.8 +pycparser==2.23 +charset-normalizer==3.4.4 +hf-xet==1.2.0 +dill==0.4.0 +tokenizers==0.22.1 +prometheus_client==0.23.1 +google-auth==2.43.0 +pydantic==2.12.5 +nvidia-ml-py==13.590.44 +fastapi-cloud-cli==0.6.0 +flashinfer-python==0.3.1 +orjson==3.11.5 +python-dateutil==2.9.0.post0 +GitPython==3.1.45 +triton==3.0.0 +torchao==0.9.0 +soundfile==0.13.1 +diskcache==5.6.3 +docstring_parser==0.17.0 +anyio==4.12.0 +matplotlib-inline==0.2.1 +Pygments==2.19.2 +pure_eval==0.2.3 +ninja==1.13.0 +outlines==0.0.46 +wandb==0.23.1 +regex==2025.11.3 +pyzmq==27.1.0 +iniconfig==2.3.0 +Jinja2==3.1.6 +wheel==0.45.1 +megatron-core==0.13.1 +multidict==6.7.0 +uvloop==0.22.1 +proto-plus==1.26.1 +pylatexenc==2.10 +decorator==5.2.1 +websockets==15.0.1 +shellingham==1.5.4 +lxml==6.0.2 +safetensors==0.7.0 +scipy==1.16.3 +xgrammar==0.1.25 +pybase64==1.4.3 +opentelemetry-semantic-conventions==0.60b0 +pydantic-extra-types==2.10.6 +rignore==0.7.6 +nvidia-cuda-runtime-cu12==12.1.105 +distlib==0.4.0 +executing==2.2.1 +grpcio==1.76.0 +pip==25.3 +verl==0.1 +autocommand==2.2.2 +typeguard==4.3.0 +more-itertools==10.3.0 +importlib_metadata==8.0.0 +jaraco.functools==4.0.1 +typing_extensions==4.12.2 +jaraco.text==3.12.1 +platformdirs==4.2.2 +jaraco.collections==5.1.0 +jaraco.context==5.3.0 +inflect==7.3.1 +tomli==2.0.1 +zipp==3.19.2 +backports.tarfile==1.2.0 +wheel==0.45.1 +packaging==24.2 diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/files/wandb-metadata.json b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..0ab5bffd7a3a1fe7b8138b54a8900afd686bba50 --- /dev/null +++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/files/wandb-metadata.json @@ -0,0 +1,93 @@ +{ + "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.12", + "startedAt": "2026-02-02T01:24:56.695929Z", + "args": [ + "--node-ip-address=172.16.34.29", + "--node-manager-port=35889", + "--object-store-name=/tmp/ray/session_2026-02-01_20-24-16_851383_1554534/sockets/plasma_store", + "--raylet-name=/tmp/ray/session_2026-02-01_20-24-16_851383_1554534/sockets/raylet", + "--redis-address=None", + "--metrics-agent-port=60951", + "--logging-rotate-bytes=536870912", + 
"--logging-rotate-backup-count=5", + "--runtime-env-agent-port=63006", + "--gcs-address=172.16.34.29:62587", + "--session-name=session_2026-02-01_20-24-16_851383_1554534", + "--temp-dir=/tmp/ray", + "--webui=127.0.0.1:8301", + "--cluster-id=2a6a92c2ce2c7497d2c570ee4ed306ca282172c9fb3a948d0d91f16a", + "--startup-token=128", + "--worker-launch-time-ms=1769995470827", + "--node-id=8646fde502441bfcc43d5303c4610d10bdcd65e2ec1b75c5626051ce", + "--runtime-env-hash=1830736042" + ], + "program": "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/ray/_private/workers/default_worker.py", + "git": { + "remote": "https://github.com/PeterGriffinJin/Search-R1", + "commit": "598e61bd1d36895726d28a8d06b3a15bed19f5d3" + }, + "email": "shahidulshakib034@gmail.com", + "root": "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1", + "host": "gamma", + "executable": "/home/mshahidul/miniconda3/envs/verl/bin/python3", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA A100 80GB PCIe", + "gpu_count": 6, + "disk": { + "/": { + "total": "3766429188096", + "used": "184573485056" + } + }, + "memory": { + "total": "1081814863872" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece" + } + ], + "cudaVersion": "13.0", + "writerId": "vtydy8v1vqlfqdr1gmygsbjfgg784jf6" +} \ No newline at end of file diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug-core.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..01035e4956719aba11e232284dad8bcd06699791 --- /dev/null +++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2026-02-01T20:24:56.909555274-05:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpoqvy8trh/port-1562972.txt","pid":1562972,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-01T20:24:56.910726586-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":1562972} +{"time":"2026-02-01T20:24:56.91070461-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-1562972-1568004-2806483583/socket","Net":"unix"}} +{"time":"2026-02-01T20:24:57.077884409-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} 
+{"time":"2026-02-01T20:24:57.090876423-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"lmw144t2","id":"1(@)"} +{"time":"2026-02-01T20:24:58.917447274-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"lmw144t2","id":"1(@)"} +{"time":"2026-02-01T20:25:47.319167155-05:00","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug-internal.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..1062a599f72886f79baca3122e696d65c44cecbd --- /dev/null +++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2026-02-01T20:24:57.092208445-05:00","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2026-02-01T20:24:58.913739618-05:00","level":"INFO","msg":"stream: created new stream","id":"lmw144t2"} +{"time":"2026-02-01T20:24:58.913897747-05:00","level":"INFO","msg":"handler: started","stream_id":"lmw144t2"} +{"time":"2026-02-01T20:24:58.91742272-05:00","level":"INFO","msg":"stream: started","id":"lmw144t2"} +{"time":"2026-02-01T20:24:58.917508674-05:00","level":"INFO","msg":"writer: started","stream_id":"lmw144t2"} +{"time":"2026-02-01T20:24:58.91751844-05:00","level":"INFO","msg":"sender: started","stream_id":"lmw144t2"} diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..d18fa765e5d66f927fd81e9e5bf17d79c139c003 --- /dev/null +++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug.log @@ -0,0 +1,21 @@ +2026-02-01 20:24:56,717 INFO MainThread:1562972 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2026-02-01 20:24:56,717 INFO MainThread:1562972 [wandb_setup.py:_flush():80] Configure stats pid to 1562972 +2026-02-01 20:24:56,718 INFO MainThread:1562972 [wandb_setup.py:_flush():80] Loading settings from /home/mshahidul/.config/wandb/settings +2026-02-01 20:24:56,718 INFO MainThread:1562972 [wandb_setup.py:_flush():80] Loading settings from /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/settings +2026-02-01 20:24:56,718 INFO MainThread:1562972 [wandb_setup.py:_flush():80] Loading settings from environment variables +2026-02-01 20:24:56,718 INFO MainThread:1562972 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug.log +2026-02-01 20:24:56,718 INFO MainThread:1562972 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/logs/debug-internal.log +2026-02-01 20:24:56,719 INFO MainThread:1562972 [wandb_init.py:init():841] calling init triggers +2026-02-01 20:24:56,719 INFO MainThread:1562972 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'data': {'tokenizer': None, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet', 'train_data_num': None, 'val_data_num': None, 'prompt_key': 'prompt', 'max_prompt_length': 4096, 
'max_response_length': 1024, 'max_start_length': 256, 'max_obs_length': 512, 'train_batch_size': 128, 'val_batch_size': 64, 'return_raw_input_ids': False, 'return_raw_chat': False, 'shuffle_train_dataloader': True}, 'actor_rollout_ref': {'hybrid_engine': True, 'model': {'path': 'Qwen/Qwen3-4B-Instruct-2507', 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'use_remove_padding': True}, 'actor': {'strategy': 'fsdp', 'ppo_mini_batch_size': 64, 'ppo_micro_batch_size': 64, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'grad_clip': 1.0, 'state_masking': False, 'clip_ratio': 0.2, 'entropy_coeff': 0.001, 'use_kl_loss': False, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'ulysses_sequence_parallel_size': 1, 'optim': {'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'min_lr_ratio': None, 'warmup_style': 'constant', 'total_training_steps': 1005}, 'fsdp_config': {'wrap_policy': {'min_num_params': 0}, 'param_offload': True, 'grad_offload': False, 'optimizer_offload': True, 'fsdp_size': -1}, 'ppo_micro_batch_size_per_gpu': 16}, 'ref': {'fsdp_config': {'param_offload': True, 'wrap_policy': {'min_num_params': 0}, 'fsdp_size': -1}, 'log_prob_micro_batch_size': 64, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'ulysses_sequence_parallel_size': 1}, 'rollout': {'name': 'vllm', 'temperature': 1.0, 'top_k': -1, 'top_p': 0.95, 'prompt_length': 4096, 'response_length': 1024, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.4, 'ignore_eos': False, 'enforce_eager': True, 'free_cache_engine': True, 'load_format': 'dummy_dtensor', 'tensor_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_num_seqs': 1024, 'log_prob_micro_batch_size': 64, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'do_sample': True, 'n': 1, 'n_agent': 1}}, 'critic': {'strategy': 'fsdp', 'optim': {'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'min_lr_ratio': None, 'warmup_style': 'constant', 'total_training_steps': 1005}, 'model': {'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'enable_gradient_checkpointing': False, 'use_remove_padding': False, 'fsdp_config': {'param_offload': False, 'grad_offload': False, 'optimizer_offload': False, 'wrap_policy': {'min_num_params': 0}, 'fsdp_size': -1}}, 'ppo_mini_batch_size': 64, 'ppo_micro_batch_size': 64, 'forward_micro_batch_size': 64, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ulysses_sequence_parallel_size': 1, 'ppo_epochs': 1, 'shuffle': False, 'grad_clip': 1.0, 'cliprange_value': 0.5}, 'reward_model': {'enable': False, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'use_remove_padding': False, 'fsdp_config': {'min_num_params': 0, 'param_offload': False}}, 'micro_batch_size': 64, 'max_length': None, 'ulysses_sequence_parallel_size': 1, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'structure_format_score': 0, 'final_format_score': 0, 'retrieval_score': 0}, 'retriever': {'url': 'http://127.0.0.1:8000/retrieve', 'topk': 3}, 'algorithm': {'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'no_think_rl': False, 'kl_penalty': 'kl', 'kl_ctrl': {'type': 'fixed', 'kl_coef': 0.001}, 'state_masking': {'start_state_marker': '', 'end_state_marker': ''}}, 'trainer': {'total_epochs': 15, 
'total_training_steps': 1005, 'project_name': '', 'experiment_name': 'llm_guard_3B_10k_v2', 'logger': ['wandb'], 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 100, 'test_freq': 50, 'critic_warmup': 0, 'default_hdfs_dir': '~/experiments/gsm8k/ppo/llm_guard_3B_10k_v2', 'default_local_dir': 'verl_checkpoints/llm_guard_3B_10k_v2'}, 'max_turns': 1, 'do_search': False, '_wandb': {}} +2026-02-01 20:24:56,719 INFO MainThread:1562972 [wandb_init.py:init():889] starting backend +2026-02-01 20:24:57,078 INFO MainThread:1562972 [wandb_init.py:init():892] sending inform_init request +2026-02-01 20:24:57,086 INFO MainThread:1562972 [wandb_init.py:init():900] backend started and connected +2026-02-01 20:24:57,092 INFO MainThread:1562972 [wandb_init.py:init():970] updated telemetry +2026-02-01 20:24:57,114 INFO MainThread:1562972 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2026-02-01 20:24:59,139 INFO MainThread:1562972 [wandb_init.py:init():1041] starting run threads in backend +2026-02-01 20:24:59,972 INFO MainThread:1562972 [wandb_run.py:_console_start():2521] atexit reg +2026-02-01 20:24:59,973 INFO MainThread:1562972 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2026-02-01 20:24:59,973 INFO MainThread:1562972 [wandb_run.py:_redirect():2438] Wrapping output streams. +2026-02-01 20:24:59,973 INFO MainThread:1562972 [wandb_run.py:_redirect():2461] Redirects installed. +2026-02-01 20:24:59,985 INFO MainThread:1562972 [wandb_init.py:init():1081] run started, returning control to user process diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/run-lmw144t2.wandb b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/run-lmw144t2.wandb new file mode 100644 index 0000000000000000000000000000000000000000..30977402b08eb894463b83d7de7a007255879a32 Binary files /dev/null and b/code/RL_model/verl/Search-R1/wandb/run-20260201_202456-lmw144t2/run-lmw144t2.wandb differ diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/files/output.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..b0055b3731d82dd99f43f1be90744402fc3ac222 --- /dev/null +++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/files/output.log @@ -0,0 +1,53 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::WorkerDict.actor_rollout_init_model() (pid=1584886, ip=172.16.34.29, actor_id=73dee5b169bd353a8f66401d01000000, repr=) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/single_controller/ray/base.py", line 399, in func + return getattr(self.worker_dict[key], name)(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/single_controller/base/decorator.py", line 404, in inner + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py", line 332, in init_model + self.rollout, self.rollout_sharding_manager = self._build_rollout() + ^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py", line 268, in _build_rollout + rollout = vLLMRollout(actor_module=self.actor_module_fsdp, + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/workers/rollout/vllm_rollout/vllm_rollout.py", line 91, in __init__ + self.inference_engine = LLM(actor_module, + ^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/llm.py", line 142, in __init__ + self.llm_engine = LLMEngine.from_engine_args(model, tokenizer, engine_args) # TODO: check usagecontext + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/llm_engine_sp.py", line 382, in from_engine_args + engine_config = engine_args.create_engine_config() + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/arg_utils.py", line 71, in create_engine_config + engine_config = super().create_engine_config() + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/engine/arg_utils.py", line 900, in create_engine_config + model_config = self.create_model_config() + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/arg_utils.py", line 34, in create_model_config + return ModelConfig( + ^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/config.py", line 47, in __init__ + super().__init__(model=hf_config._name_or_path, tokenizer=hf_config._name_or_path, *args, **kwargs) + File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/config.py", line 194, in __init__ + self.multimodal_config = self._init_multimodal_config( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/config.py", line 213, in _init_multimodal_config + if ModelRegistry.is_multimodal_model(architectures): + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/model_executor/models/registry.py", line 384, in is_multimodal_model + return self.inspect_model_cls(architectures).supports_multimodal + 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/model_executor/models/registry.py", line 353, in inspect_model_cls + return self._raise_for_unsupported(architectures) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/model_executor/models/registry.py", line 314, in _raise_for_unsupported + raise ValueError( +ValueError: Model architectures ['Qwen3ForCausalLM'] are not supported for now. Supported architectures: ['AquilaModel', 'AquilaForCausalLM', 'ArcticForCausalLM', 'BaiChuanForCausalLM', 'BaichuanForCausalLM', 'BloomForCausalLM', 'CohereForCausalLM', 'DbrxForCausalLM', 'DeciLMForCausalLM', 'DeepseekForCausalLM', 'DeepseekV2ForCausalLM', 'ExaoneForCausalLM', 'FalconForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTJForCausalLM', 'GPTNeoXForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'InternLMForCausalLM', 'InternLM2ForCausalLM', 'JAISLMHeadModel', 'JambaForCausalLM', 'LlamaForCausalLM', 'LLaMAForCausalLM', 'MambaForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'QuantMixtralForCausalLM', 'MptForCausalLM', 'MPTForCausalLM', 'MiniCPMForCausalLM', 'MiniCPM3ForCausalLM', 'NemotronForCausalLM', 'OlmoForCausalLM', 'OlmoeForCausalLM', 'OPTForCausalLM', 'OrionForCausalLM', 'PersimmonForCausalLM', 'PhiForCausalLM', 'Phi3ForCausalLM', 'Phi3SmallForCausalLM', 'PhiMoEForCausalLM', 'Qwen2ForCausalLM', 'Qwen2MoeForCausalLM', 'RWForCausalLM', 'StableLMEpochForCausalLM', 'StableLmForCausalLM', 'Starcoder2ForCausalLM', 'SolarForCausalLM', 'XverseForCausalLM', 'BartModel', 'BartForConditionalGeneration', 'MistralModel', 'Qwen2ForRewardModel', 'Gemma2Model', 'Blip2ForConditionalGeneration', 'ChameleonForConditionalGeneration', 'ChatGLMModel', 'ChatGLMForConditionalGeneration', 'FuyuForCausalLM', 'InternVLChatModel', 'LlavaForConditionalGeneration', 'LlavaNextForConditionalGeneration', 'LlavaNextVideoForConditionalGeneration', 'LlavaOnevisionForConditionalGeneration', 'MiniCPMV', 'MolmoForCausalLM', 'NVLM_D', 'PaliGemmaForConditionalGeneration', 'Phi3VForCausalLM', 'PixtralForConditionalGeneration', 'QWenLMHeadModel', 'Qwen2VLForConditionalGeneration', 'UltravoxModel', 'MllamaForConditionalGeneration', 'EAGLEModel', 'MedusaModel', 'MLPSpeculatorPreTrainedModel'] diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/files/requirements.txt b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..2dd689031142d7d6d129a52ef2a92f2070eb5c06 --- /dev/null +++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/files/requirements.txt @@ -0,0 +1,288 @@ +verl==0.1 +psutil==7.1.3 +colorama==0.4.6 +annotated-doc==0.0.4 +pyasn1==0.6.1 +virtualenv==20.35.4 +requests==2.32.5 +nvidia-cufft-cu12==11.0.2.54 +nvidia-cufile-cu12==1.13.1.3 +verl==0.1 +ml_dtypes==0.5.4 +opentelemetry-sdk==1.39.0 +sglang==0.5.2 +xformers==0.0.27.post2 +lm-format-enforcer==0.10.6 +typing_extensions==4.15.0 +nvidia-cusparselt-cu12==0.7.1 +openai-harmony==0.0.4 +transformers==4.56.1 +pytest==9.0.2 +psutil==7.1.3 +cupy-cuda12x==13.6.0 +tqdm==4.67.1 +onnx==1.20.0 +pybind11==3.0.1 +partial-json-parser==0.2.1.1.post7 +nvidia-nccl-cu12==2.20.5 +aiohttp-cors==0.8.1 +sniffio==1.3.1 +tensordict==0.10.0 +smart_open==7.5.0 +cffi==2.0.0 +asttokens==3.0.1 +opencensus==0.11.4 +rpds-py==0.30.0 +py-spy==0.4.1 
+nvidia-nvjitlink-cu12==12.8.93 +httpx==0.28.1 +cuda-python==13.1.1 +annotated-types==0.7.0 +idna==3.11 +fsspec==2025.10.0 +parso==0.8.5 +torchvision==0.19.0 +MarkupSafe==3.0.3 +opentelemetry-api==1.39.0 +pytz==2025.2 +dnspython==2.8.0 +zipp==3.23.0 +PyYAML==6.0.3 +onnx-ir==0.1.12 +torchdata==0.11.0 +Markdown==3.10 +urllib3==2.6.1 +cuda-pathfinder==1.3.3 +nvidia-cuda-cupti-cu12==12.1.105 +httptools==0.7.1 +pyarrow==22.0.0 +opentelemetry-proto==1.39.0 +certifi==2025.11.12 +typer==0.20.0 +python-json-logger==4.0.0 +pillow==12.0.0 +cuda-bindings==13.1.1 +Werkzeug==3.1.4 +mdurl==0.1.2 +vllm==0.6.3 +referencing==0.37.0 +xxhash==3.6.0 +interegular==0.3.3 +build==1.3.0 +fastapi-cli==0.0.16 +tensorboard==2.20.0 +sentencepiece==0.2.1 +flash_attn==2.8.3 +yarl==1.22.0 +opencv-fixer==0.2.5 +python-dotenv==1.2.1 +timm==1.0.16 +aiohappyeyeballs==2.6.1 +decord==0.6.0 +nvidia-cusolver-cu12==11.4.5.107 +jiter==0.12.0 +airportsdata==20250909 +nvidia-nvtx-cu12==12.1.105 +markdown-it-py==4.0.0 +torch==2.4.0 +thefuzz==0.22.1 +opencv-python-headless==4.11.0.86 +pycryptodomex==3.23.0 +pexpect==4.9.0 +distro==1.9.0 +cloudpickle==3.1.2 +mpmath==1.3.0 +antlr4-python3-runtime==4.9.3 +peft==0.18.0 +tzdata==2025.2 +accelerate==1.12.0 +watchfiles==1.1.1 +omegaconf==2.3.0 +multiprocess==0.70.18 +frozendict==2.4.7 +sympy==1.14.0 +setproctitle==1.3.7 +setuptools==79.0.1 +py-cpuinfo==9.0.0 +ipython_pygments_lexers==1.1.1 +openai==1.99.1 +outlines_core==0.2.11 +google-api-core==2.28.1 +llvmlite==0.44.0 +attrs==25.4.0 +packaging==25.0 +fastrlock==0.8.3 +astor==0.8.1 +gguf==0.10.0 +opencv-python==4.12.0.88 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cudnn-frontend==1.16.0 +pluggy==1.6.0 +compressed-tensors==0.11.0 +importlib_metadata==8.7.0 +RapidFuzz==3.14.3 +networkx==3.6.1 +httpcore==1.0.9 +pre_commit==4.5.0 +python-multipart==0.0.20 +rich==14.2.0 +onnxscript==0.3.1 +cbor2==5.7.1 +smmap==5.0.2 +numpy==1.26.4 +opentelemetry-exporter-prometheus==0.60b0 +click==8.2.1 +traitlets==5.14.3 +nvidia-curand-cu12==10.3.2.106 +pyvers==0.1.0 +huggingface-hub==0.36.0 +cfgv==3.5.0 +optree==0.18.0 +anthropic==0.75.0 +email-validator==2.3.0 +tabulate==0.9.0 +msgpack==1.1.2 +depyf==0.19.0 +numba==0.61.2 +six==1.17.0 +einops==0.8.1 +aiosignal==1.4.0 +propcache==0.4.1 +torch_memory_saver==0.0.8 +h11==0.16.0 +frozenlist==1.8.0 +pycountry==24.6.1 +modelscope==1.33.0 +sentry-sdk==2.47.0 +av==16.0.1 +stack-data==0.6.3 +typing-inspection==0.4.2 +googleapis-common-protos==1.72.0 +blake3==1.0.8 +nvidia-cudnn-cu12==9.1.0.70 +liger_kernel==0.6.4 +wrapt==2.0.1 +prompt_toolkit==3.0.52 +torchaudio==2.8.0 +identify==2.6.15 +mistral_common==1.8.6 +codetiming==1.4.0 +nodeenv==1.9.1 +platformdirs==4.5.1 +jsonschema-specifications==2025.9.1 +protobuf==6.33.2 +hydra-core==1.3.2 +absl-py==2.3.1 +tensorboard-data-server==0.7.2 +jsonschema==4.25.1 +pyasn1_modules==0.4.2 +tiktoken==0.12.0 +starlette==0.50.0 +pyproject_hooks==1.2.0 +fastapi==0.124.2 +rsa==4.9.1 +nest-asyncio==1.6.0 +lark==1.2.2 +fastar==0.8.0 +datasets==4.4.1 +prometheus-fastapi-instrumentator==7.1.0 +nvidia-cusparse-cu12==12.1.0.106 +ruff==0.14.8 +mathruler==0.1.0 +pydantic_core==2.41.5 +pyairports==0.0.1 +ipython==9.8.0 +pynvml==13.0.1 +nvidia-cuda-nvrtc-cu12==12.1.105 +filelock==3.20.0 +loguru==0.7.3 +pandas==2.3.3 +msgspec==0.20.0 +uvicorn==0.38.0 +blobfile==3.0.0 +gitdb==4.0.12 +cachetools==6.2.2 +uv==0.9.17 +llguidance==0.7.30 +hf_transfer==0.1.9 +wcwidth==0.2.14 +aiohttp==3.13.2 +qwen-vl-utils==0.0.14 +rich-toolkit==0.17.0 +ptyprocess==0.7.0 +ipdb==0.13.13 +opencensus-context==0.1.3 +jedi==0.19.2 
+soxr==1.0.0 +ray==2.52.1 +sgl-kernel==0.3.9.post2 +colorful==0.5.8 +pycparser==2.23 +charset-normalizer==3.4.4 +hf-xet==1.2.0 +dill==0.4.0 +tokenizers==0.22.1 +prometheus_client==0.23.1 +google-auth==2.43.0 +pydantic==2.12.5 +nvidia-ml-py==13.590.44 +fastapi-cloud-cli==0.6.0 +flashinfer-python==0.3.1 +orjson==3.11.5 +python-dateutil==2.9.0.post0 +GitPython==3.1.45 +triton==3.0.0 +torchao==0.9.0 +soundfile==0.13.1 +diskcache==5.6.3 +docstring_parser==0.17.0 +anyio==4.12.0 +matplotlib-inline==0.2.1 +Pygments==2.19.2 +pure_eval==0.2.3 +ninja==1.13.0 +outlines==0.0.46 +wandb==0.23.1 +regex==2025.11.3 +pyzmq==27.1.0 +iniconfig==2.3.0 +Jinja2==3.1.6 +wheel==0.45.1 +megatron-core==0.13.1 +multidict==6.7.0 +uvloop==0.22.1 +proto-plus==1.26.1 +pylatexenc==2.10 +decorator==5.2.1 +websockets==15.0.1 +shellingham==1.5.4 +lxml==6.0.2 +safetensors==0.7.0 +scipy==1.16.3 +xgrammar==0.1.25 +pybase64==1.4.3 +opentelemetry-semantic-conventions==0.60b0 +pydantic-extra-types==2.10.6 +rignore==0.7.6 +nvidia-cuda-runtime-cu12==12.1.105 +distlib==0.4.0 +executing==2.2.1 +grpcio==1.76.0 +pip==25.3 +verl==0.1 +autocommand==2.2.2 +typeguard==4.3.0 +more-itertools==10.3.0 +importlib_metadata==8.0.0 +jaraco.functools==4.0.1 +typing_extensions==4.12.2 +jaraco.text==3.12.1 +platformdirs==4.2.2 +jaraco.collections==5.1.0 +jaraco.context==5.3.0 +inflect==7.3.1 +tomli==2.0.1 +zipp==3.19.2 +backports.tarfile==1.2.0 +wheel==0.45.1 +packaging==24.2 diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/files/wandb-metadata.json b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8cf15a6570ef487e4a89d88ebc3824111f0f9fdf --- /dev/null +++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/files/wandb-metadata.json @@ -0,0 +1,93 @@ +{ + "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.12", + "startedAt": "2026-02-02T01:27:25.847668Z", + "args": [ + "--node-ip-address=172.16.34.29", + "--node-manager-port=42463", + "--object-store-name=/tmp/ray/session_2026-02-01_20-26-45_640792_1570462/sockets/plasma_store", + "--raylet-name=/tmp/ray/session_2026-02-01_20-26-45_640792_1570462/sockets/raylet", + "--redis-address=None", + "--metrics-agent-port=52627", + "--logging-rotate-bytes=536870912", + "--logging-rotate-backup-count=5", + "--runtime-env-agent-port=50340", + "--gcs-address=172.16.34.29:59784", + "--session-name=session_2026-02-01_20-26-45_640792_1570462", + "--temp-dir=/tmp/ray", + "--webui=127.0.0.1:8301", + "--cluster-id=aea41d36cae02be0d21d983bc0d205680cba8b3ba963a20925793ff7", + "--startup-token=128", + "--worker-launch-time-ms=1769995620433", + "--node-id=43683b8f20fc380a586055e760eed6ea68a97aefc99ec5eb0e1aaf3e", + "--runtime-env-hash=1830736042" + ], + "program": "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/ray/_private/workers/default_worker.py", + "git": { + "remote": "https://github.com/PeterGriffinJin/Search-R1", + "commit": "598e61bd1d36895726d28a8d06b3a15bed19f5d3" + }, + "email": "shahidulshakib034@gmail.com", + "root": "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1", + "host": "gamma", + "executable": "/home/mshahidul/miniconda3/envs/verl/bin/python3", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA A100 80GB PCIe", + "gpu_count": 6, + "disk": { + "/": { + "total": "3766429188096", + "used": "184577196032" + } + }, + "memory": { + "total": "1081814863872" + }, + "gpu_nvidia": [ + { 
+ "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece" + } + ], + "cudaVersion": "13.0", + "writerId": "6aeyut6ybrvvbk4fszgmutrk1al0827k" +} \ No newline at end of file diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug-core.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..999c4ddf8a791081b9d41c2e39eb6bf37be29d3e --- /dev/null +++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2026-02-01T20:27:26.085930508-05:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpf6067hxz/port-1578907.txt","pid":1578907,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-01T20:27:26.089468105-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":1578907} +{"time":"2026-02-01T20:27:26.08946884-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-1578907-1584383-2031627458/socket","Net":"unix"}} +{"time":"2026-02-01T20:27:26.251033877-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-01T20:27:26.267228916-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"lly0j9zs","id":"1(@)"} +{"time":"2026-02-01T20:27:27.695521129-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"lly0j9zs","id":"1(@)"} +{"time":"2026-02-01T20:28:55.044327673-05:00","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug-internal.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..303163b4a3ed9a27addcc89f564458d66d92cea4 --- /dev/null +++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2026-02-01T20:27:26.269116545-05:00","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2026-02-01T20:27:27.692526697-05:00","level":"INFO","msg":"stream: created new stream","id":"lly0j9zs"} +{"time":"2026-02-01T20:27:27.692680073-05:00","level":"INFO","msg":"handler: started","stream_id":"lly0j9zs"} +{"time":"2026-02-01T20:27:27.695494454-05:00","level":"INFO","msg":"stream: started","id":"lly0j9zs"} 
+{"time":"2026-02-01T20:27:27.69557747-05:00","level":"INFO","msg":"writer: started","stream_id":"lly0j9zs"} +{"time":"2026-02-01T20:27:27.695701035-05:00","level":"INFO","msg":"sender: started","stream_id":"lly0j9zs"} diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug.log b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..8df0f98e5da5d448c64dbafc1ef3703811880cd5 --- /dev/null +++ b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug.log @@ -0,0 +1,21 @@ +2026-02-01 20:27:25,874 INFO MainThread:1578907 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2026-02-01 20:27:25,874 INFO MainThread:1578907 [wandb_setup.py:_flush():80] Configure stats pid to 1578907 +2026-02-01 20:27:25,875 INFO MainThread:1578907 [wandb_setup.py:_flush():80] Loading settings from /home/mshahidul/.config/wandb/settings +2026-02-01 20:27:25,875 INFO MainThread:1578907 [wandb_setup.py:_flush():80] Loading settings from /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/settings +2026-02-01 20:27:25,875 INFO MainThread:1578907 [wandb_setup.py:_flush():80] Loading settings from environment variables +2026-02-01 20:27:25,875 INFO MainThread:1578907 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug.log +2026-02-01 20:27:25,875 INFO MainThread:1578907 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug-internal.log +2026-02-01 20:27:25,876 INFO MainThread:1578907 [wandb_init.py:init():841] calling init triggers +2026-02-01 20:27:25,876 INFO MainThread:1578907 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'data': {'tokenizer': None, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet', 'train_data_num': None, 'val_data_num': None, 'prompt_key': 'prompt', 'max_prompt_length': 4096, 'max_response_length': 1024, 'max_start_length': 256, 'max_obs_length': 512, 'train_batch_size': 128, 'val_batch_size': 64, 'return_raw_input_ids': False, 'return_raw_chat': False, 'shuffle_train_dataloader': True}, 'actor_rollout_ref': {'hybrid_engine': True, 'model': {'path': 'Qwen/Qwen3-4B-Instruct-2507', 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'use_remove_padding': False}, 'actor': {'strategy': 'fsdp', 'ppo_mini_batch_size': 64, 'ppo_micro_batch_size': 64, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'grad_clip': 1.0, 'state_masking': False, 'clip_ratio': 0.2, 'entropy_coeff': 0.001, 'use_kl_loss': False, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'ulysses_sequence_parallel_size': 1, 'optim': {'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'min_lr_ratio': None, 'warmup_style': 'constant', 'total_training_steps': 1005}, 'fsdp_config': {'wrap_policy': {'min_num_params': 0}, 'param_offload': True, 'grad_offload': False, 'optimizer_offload': True, 'fsdp_size': -1}, 'ppo_micro_batch_size_per_gpu': 16}, 'ref': {'fsdp_config': {'param_offload': True, 'wrap_policy': {'min_num_params': 0}, 'fsdp_size': -1}, 'log_prob_micro_batch_size': 64, 
'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'ulysses_sequence_parallel_size': 1}, 'rollout': {'name': 'vllm', 'temperature': 1.0, 'top_k': -1, 'top_p': 0.95, 'prompt_length': 4096, 'response_length': 1024, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.4, 'ignore_eos': False, 'enforce_eager': True, 'free_cache_engine': True, 'load_format': 'dummy_dtensor', 'tensor_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_num_seqs': 1024, 'log_prob_micro_batch_size': 64, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'do_sample': True, 'n': 1, 'n_agent': 1}}, 'critic': {'strategy': 'fsdp', 'optim': {'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'min_lr_ratio': None, 'warmup_style': 'constant', 'total_training_steps': 1005}, 'model': {'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'enable_gradient_checkpointing': False, 'use_remove_padding': False, 'fsdp_config': {'param_offload': False, 'grad_offload': False, 'optimizer_offload': False, 'wrap_policy': {'min_num_params': 0}, 'fsdp_size': -1}}, 'ppo_mini_batch_size': 64, 'ppo_micro_batch_size': 64, 'forward_micro_batch_size': 64, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ulysses_sequence_parallel_size': 1, 'ppo_epochs': 1, 'shuffle': False, 'grad_clip': 1.0, 'cliprange_value': 0.5}, 'reward_model': {'enable': False, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'use_remove_padding': False, 'fsdp_config': {'min_num_params': 0, 'param_offload': False}}, 'micro_batch_size': 64, 'max_length': None, 'ulysses_sequence_parallel_size': 1, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'structure_format_score': 0, 'final_format_score': 0, 'retrieval_score': 0}, 'retriever': {'url': 'http://127.0.0.1:8000/retrieve', 'topk': 3}, 'algorithm': {'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'no_think_rl': False, 'kl_penalty': 'kl', 'kl_ctrl': {'type': 'fixed', 'kl_coef': 0.001}, 'state_masking': {'start_state_marker': '', 'end_state_marker': ''}}, 'trainer': {'total_epochs': 15, 'total_training_steps': 1005, 'project_name': '', 'experiment_name': 'llm_guard_3B_10k_v2', 'logger': ['wandb'], 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 100, 'test_freq': 50, 'critic_warmup': 0, 'default_hdfs_dir': '~/experiments/gsm8k/ppo/llm_guard_3B_10k_v2', 'default_local_dir': 'verl_checkpoints/llm_guard_3B_10k_v2'}, 'max_turns': 1, 'do_search': False, '_wandb': {}} +2026-02-01 20:27:25,876 INFO MainThread:1578907 [wandb_init.py:init():889] starting backend +2026-02-01 20:27:26,251 INFO MainThread:1578907 [wandb_init.py:init():892] sending inform_init request +2026-02-01 20:27:26,261 INFO MainThread:1578907 [wandb_init.py:init():900] backend started and connected +2026-02-01 20:27:26,270 INFO MainThread:1578907 [wandb_init.py:init():970] updated telemetry +2026-02-01 20:27:26,293 INFO MainThread:1578907 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2026-02-01 20:27:27,908 INFO MainThread:1578907 [wandb_init.py:init():1041] starting run threads in backend +2026-02-01 20:27:28,715 INFO MainThread:1578907 [wandb_run.py:_console_start():2521] atexit reg +2026-02-01 20:27:28,716 INFO MainThread:1578907 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2026-02-01 20:27:28,716 INFO MainThread:1578907 
[wandb_run.py:_redirect():2438] Wrapping output streams. +2026-02-01 20:27:28,716 INFO MainThread:1578907 [wandb_run.py:_redirect():2461] Redirects installed. +2026-02-01 20:27:28,726 INFO MainThread:1578907 [wandb_init.py:init():1081] run started, returning control to user process diff --git a/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/run-lly0j9zs.wandb b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/run-lly0j9zs.wandb new file mode 100644 index 0000000000000000000000000000000000000000..30977402b08eb894463b83d7de7a007255879a32 Binary files /dev/null and b/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/run-lly0j9zs.wandb differ diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-01/22-02-10/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-01/22-02-10/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa2926ecb633cc627e36315302088546c50453ef --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + 
all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} 
+ response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + 
tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} 
+ use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de72ae678462cbdd9eff945fc9c5cf1e363eb8af --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: 
'${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - 
trainer.test_freq=5 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e5ec0c1ce0fd1bcc5ac8574bcccdd4650aef0317 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- 
trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-06/18-55-54/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..857e0a79019b5711eb7377126a063d42afed23fb --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: 
verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.8 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + 
data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + 
speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + 
shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e8abfafb0f6bb0261095aa43a6e040c407d8f111 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: 
'${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.8 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - 
trainer.test_freq=5 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.8,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1ea253ab12d257896733b5b02335994363d0ff7e --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.8 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- 
trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-37-47/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a0c2bc7bf17311667e76f7481048c869e0814be5 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: 
verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.85 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + 
data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + 
speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + 
shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 20 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 5 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..92b5144edb482d95b09b7fafae1228062bd4358b --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: 
'${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.85 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - 
trainer.test_freq=5 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.85,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2890e9a59c960ee8661bdbdd6ed0b91b44ed0d12 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.85 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- 
trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=20 +- trainer.test_freq=5 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-45-54/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1325dcb8376c0482fa06f7fa92cd5021b0e4aa01 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=20 + - trainer.test_freq=5 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-optimized-multiclinsum-gs,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=20,trainer.test_freq=5,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-06/20-52-38/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3e1b950a9b8d064316fca1c72f7a0927d7b7fb3e --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: true + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + 
clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: false + offload_policy: false + 
reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + 
served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + 
override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + 
tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..760d4957b3736a567e9cdd8914ee58513bd7aca6 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/.hydra/hydra.yaml @@ -0,0 +1,212 @@ +hydra: + run: + dir: 
outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=True + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=True + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.rollout.enforce_eager=True + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - 
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac65617b049ea128ccda97a95eea9f8217f0bb52 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=True +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- 
trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3e1b950a9b8d064316fca1c72f7a0927d7b7fb3e --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: true + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + 
trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + 
log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + 
val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + 
load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41a5c80023d58fa230bd12905b1088c3c96f960d --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/.hydra/hydra.yaml @@ -0,0 +1,212 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: 
'${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=True + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=True + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.rollout.enforce_eager=True + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - 
trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac65617b049ea128ccda97a95eea9f8217f0bb52 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=True +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- 
trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3e1b950a9b8d064316fca1c72f7a0927d7b7fb3e --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: true + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + 
trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + 
log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + 
val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + 
load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..08f9c1747d2eba6bdf69386139b1a462af7ac88f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/.hydra/hydra.yaml @@ -0,0 +1,212 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: 
'${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=True + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=True + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.rollout.enforce_eager=True + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - 
trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac65617b049ea128ccda97a95eea9f8217f0bb52 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=True +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- 
trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/main_ppo.log b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-03-35/main_ppo.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-09-37/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-09-37/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3e1b950a9b8d064316fca1c72f7a0927d7b7fb3e --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-09-37/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: true + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: 
verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + 
data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + 
speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + 
shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-09-37/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-09-37/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac65617b049ea128ccda97a95eea9f8217f0bb52 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-09-37/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- 
data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=True +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3e1b950a9b8d064316fca1c72f7a0927d7b7fb3e --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: true + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + 
clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: false + 
offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: 
/tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: 
${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + 
memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4278d13794d03e5c28a182f64003e9597bceda16 --- /dev/null +++ 
b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/.hydra/hydra.yaml @@ -0,0 +1,212 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=True + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=True + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - 
actor_rollout_ref.rollout.enforce_eager=True + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac65617b049ea128ccda97a95eea9f8217f0bb52 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=True +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- 
trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier +- trainer.total_epochs=15 diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3e1b950a9b8d064316fca1c72f7a0927d7b7fb3e --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: true + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + 
trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + 
log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + 
val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + 
load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7fb5e8a3f8e7fcd48a17a4988f7055a143adf972 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/hydra.yaml @@ -0,0 +1,212 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: 
'${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=True + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=True + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.rollout.enforce_eager=True + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - 
trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac65617b049ea128ccda97a95eea9f8217f0bb52 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=True +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- 
trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier +- trainer.total_epochs=15
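
Note on the three .hydra files added above: they are Hydra's standard per-run artifacts. config.yaml is the fully composed configuration for the run, overrides.yaml lists the command-line overrides that produced it, and hydra.yaml records the job and runtime settings (job name main_ppo, output_dir, config sources). As a minimal sketch, assuming only the omegaconf package (which Hydra itself depends on) and the relative run path recorded in this diff, the dumped config can be reloaded to inspect the resolved GRPO settings without importing verl; nothing behind the _target_ keys is instantiated:

    from omegaconf import OmegaConf

    # Hypothetical relative path: the run directory recorded in this diff.
    cfg = OmegaConf.load(
        "code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/.hydra/config.yaml"
    )

    # Plain YAML loading; ${oc.select:...} interpolations resolve on access,
    # and _target_ entries are just strings here, so verl is not needed.
    print(cfg.algorithm.adv_estimator)       # grpo
    print(cfg.actor_rollout_ref.rollout.n)   # 3 rollouts per prompt (GRPO group size)
    print(cfg.trainer.default_local_dir)     # local checkpoint directory

The run itself should be reproducible by passing the entries of overrides.yaml back to the main_ppo entry point on the command line, exactly as captured in the hydra.yaml task list above.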