diff --git a/code/RL_model/verl/verl_train/examples/data_preprocess/aime2024_multiturn_w_tool.py b/code/RL_model/verl/verl_train/examples/data_preprocess/aime2024_multiturn_w_tool.py
new file mode 100644
index 0000000000000000000000000000000000000000..76cdd0576d3801118b160b850bcbd8d2fe6723b1
--- /dev/null
+++ b/code/RL_model/verl/verl_train/examples/data_preprocess/aime2024_multiturn_w_tool.py
@@ -0,0 +1,79 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023-2024 SGLang Team
+# Copyright 2025 ModelBest Inc. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Preprocess the AIME-2024 dataset to multiturn format
+"""
+
+import argparse
+import os
+
+import datasets
+
+from verl.utils.hdfs_io import copy, makedirs
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
+    parser.add_argument("--hdfs_dir", default=None)
+    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
+    parser.add_argument(
+        "--local_save_dir", default="~/data/retool_aime2024", help="The save directory for the preprocessed dataset."
+ ) + + args = parser.parse_args() + local_dataset_path = args.local_dataset_path + + data_path = "BytedTsinghua-SIA/AIME-2024" + + if local_dataset_path is not None: + dataset = datasets.load_dataset(local_dataset_path, "default") + else: + dataset = datasets.load_dataset(data_path, "default") + + train_dataset = dataset["train"] + + # add a row to each data item that represents a unique id + def make_map_fn(split): + def process_fn(example, idx): + orig_extra_info = example.pop("extra_info") + extra_info = orig_extra_info.copy() + extra_info["need_tools_kwargs"] = True + extra_info["tools_kwargs"] = { + "code_interpreter": { + "create_kwargs": { + "ground_truth": example["reward_model"]["ground_truth"], + }, + }, + } + example["extra_info"] = extra_info + return example + + return process_fn + + train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True) + + hdfs_dir = args.hdfs_dir + local_save_dir = args.local_dir + if local_save_dir is not None: + print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.") + else: + local_save_dir = args.local_save_dir + + train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet")) + + if hdfs_dir is not None: + makedirs(hdfs_dir) + copy(src=local_save_dir, dst=hdfs_dir) diff --git a/code/RL_model/verl/verl_train/examples/data_preprocess/dapo_multiturn_w_tool.py b/code/RL_model/verl/verl_train/examples/data_preprocess/dapo_multiturn_w_tool.py new file mode 100644 index 0000000000000000000000000000000000000000..aab356f41bf38e789a31f1ee879ce9beb8b0aa40 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/data_preprocess/dapo_multiturn_w_tool.py @@ -0,0 +1,79 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# Copyright 2023-2024 SGLang Team +# Copyright 2025 ModelBest Inc. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Preprocess the DAPO-Math-17k dataset to multiturn format +""" + +import argparse +import os + +import datasets + +from verl.utils.hdfs_io import copy, makedirs + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.") + parser.add_argument("--hdfs_dir", default=None) + parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.") + parser.add_argument( + "--local_save_dir", default="~/data/retool_dapo", help="The save directory for the preprocessed dataset." 
+ ) + + args = parser.parse_args() + local_dataset_path = args.local_dataset_path + + data_path = "BytedTsinghua-SIA/DAPO-Math-17k" + + if local_dataset_path is not None: + dataset = datasets.load_dataset(local_dataset_path, "default") + else: + dataset = datasets.load_dataset(data_path, "default") + + train_dataset = dataset["train"] + + # add a row to each data item that represents a unique id + def make_map_fn(split): + def process_fn(example, idx): + orig_extra_info = example.pop("extra_info") + extra_info = orig_extra_info.copy() + extra_info["need_tools_kwargs"] = True + extra_info["tools_kwargs"] = { + "code_interpreter": { + "create_kwargs": { + "ground_truth": example["reward_model"]["ground_truth"], + }, + }, + } + example["extra_info"] = extra_info + return example + + return process_fn + + train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True) + + hdfs_dir = args.hdfs_dir + local_save_dir = args.local_dir + if local_save_dir is not None: + print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.") + else: + local_save_dir = args.local_save_dir + + train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet")) + + if hdfs_dir is not None: + makedirs(hdfs_dir) + copy(src=local_save_dir, dst=hdfs_dir) diff --git a/code/RL_model/verl/verl_train/examples/data_preprocess/full_hh_rlhf.py b/code/RL_model/verl/verl_train/examples/data_preprocess/full_hh_rlhf.py new file mode 100644 index 0000000000000000000000000000000000000000..4e8a148df1e322f476cedffe4eadc5ae6ee9b6f1 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/data_preprocess/full_hh_rlhf.py @@ -0,0 +1,161 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +- Preprocess data and split the training set into 75% for training RM and 25% for validting RM. +- All the training data is used to train SFT and RL. +- Both chosen and rejected is used to train SFT +""" + +import argparse +import os + +import pandas as pd +from datasets import load_dataset +from tqdm.auto import tqdm + +from verl.utils.fs import copy, makedirs + + +def generate_sft_dataset(target_hdfs_path_dir, local_dir="~/data/full_hh_rlh/sft", local_dataset_path=None): + if local_dataset_path is not None: + dataset = load_dataset(local_dataset_path) + else: + dataset = load_dataset("Dahoas/full-hh-rlhf") + output = {"prompt": [], "response": []} + for data in tqdm(dataset["train"]): + # add chosen + output["prompt"].append(data["prompt"]) + output["response"].append(data["chosen"]) + + # add rejection + output["prompt"].append(data["prompt"]) + output["response"].append(data["rejected"]) + + df = pd.DataFrame(output) + + local_dir = os.path.expanduser(local_dir) + os.makedirs(local_dir, exist_ok=True) + + local_path = os.path.join(local_dir, "train.parquet") + + df.to_parquet(path=local_path) + + if target_hdfs_path_dir is not None: + hdfs_dir = target_hdfs_path_dir + "/" + "train.parquet" + makedirs(hdfs_dir) + + copy(local_path, hdfs_dir) + + +def generate_rm_dataset(target_hdfs_path_dir, local_dir="~/data/full_hh_rlh/rm", local_dataset_path=None): + if local_dataset_path is not None: + train_dataset = load_dataset(local_dataset_path, split="train[:75%]") + test_dataset = load_dataset(local_dataset_path, split="train[-25%:]") + else: + 
train_dataset = load_dataset("Dahoas/full-hh-rlhf", split="train[:75%]") + test_dataset = load_dataset("Dahoas/full-hh-rlhf", split="train[-25%:]") + + local_dir = os.path.expanduser(local_dir) + os.makedirs(local_dir, exist_ok=True) + + for dataset, name in zip([train_dataset, test_dataset], ["train", "test"], strict=True): + output = {"prompt": [], "chosen": [], "rejected": []} + for data in tqdm(dataset): + # add chosen + output["prompt"].append(data["prompt"]) + output["chosen"].append(data["chosen"]) + output["rejected"].append(data["rejected"]) + + df = pd.DataFrame(output) + + local_path = os.path.join(local_dir, name + ".parquet") + + df.to_parquet(path=local_path) + + if target_hdfs_path_dir is not None: + hdfs_dir = target_hdfs_path_dir + "/" + name + ".parquet" + makedirs(hdfs_dir) + + copy(local_path, hdfs_dir) + + +def generate_rl_dataset(target_hdfs_path_dir, local_dir="~/data/full_hh_rlhf/rl", local_dataset_path=None): + if local_dataset_path is not None: + dataset = load_dataset(local_dataset_path) + else: + dataset = load_dataset("Dahoas/full-hh-rlhf") + train_dataset = dataset["train"] + + data_source = "Dahoas/full-hh-rlhf" + + # add a row to each data item that represents a unique id + def make_map_fn(split): + def process_fn(example, idx): + prompt = example.pop("prompt") + response = example.pop("response") + + data = { + "data_source": data_source, + "prompt": [{"role": "user", "content": prompt}], + "ability": "alignment", + "reward_model": { + "style": "model", + "ground_truth": response, # should not be used + }, + "extra_info": {"split": split, "index": idx}, + } + return data + + return process_fn + + train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True) + local_dir = os.path.expanduser(local_dir) + local_path = os.path.join(local_dir, "train.parquet") + train_dataset.to_parquet(local_path) + + if target_hdfs_path_dir is not None: + hdfs_dir = target_hdfs_path_dir + "/" + "train.parquet" + makedirs(hdfs_dir) 
+ + copy(local_path, hdfs_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--split", type=str, choices=["sft", "rm", "rl"], required=True) + parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.") + parser.add_argument("--hdfs_dir", type=str, required=False, default=None) + parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.") + parser.add_argument( + "--local_save_dir", + type=str, + default="~/data/full_hh_rlhf", + help="The save directory for the preprocessed dataset.", + ) + + args = parser.parse_args() + local_save_dir = args.local_dir + if local_save_dir is not None: + print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.") + else: + local_save_dir = args.local_save_dir + + if args.split == "sft": + generate_sft_dataset(args.hdfs_dir, os.path.join(local_save_dir, args.split), args.local_dataset_path) + elif args.split == "rm": + generate_rm_dataset(args.hdfs_dir, os.path.join(local_save_dir, args.split), args.local_dataset_path) + elif args.split == "rl": + generate_rl_dataset(args.hdfs_dir, os.path.join(local_save_dir, args.split), args.local_dataset_path) + else: + raise NotImplementedError diff --git a/code/RL_model/verl/verl_train/examples/data_preprocess/geo3k.py b/code/RL_model/verl/verl_train/examples/data_preprocess/geo3k.py new file mode 100644 index 0000000000000000000000000000000000000000..ba84fd3fc440761a200d0fbdea1535bfe9889b45 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/data_preprocess/geo3k.py @@ -0,0 +1,102 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Preprocess the Geometry3k dataset to parquet format +""" + +import argparse +import os + +import datasets + +from verl.utils.hdfs_io import copy, makedirs + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--local_dir", default=None) + parser.add_argument("--hdfs_dir", default=None) + parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.") + parser.add_argument( + "--local_save_dir", default="~/data/geo3k", help="The save directory for the preprocessed dataset." + ) + + args = parser.parse_args() + local_dataset_path = args.local_dataset_path + + data_source = "hiyouga/geometry3k" + + if local_dataset_path is not None: + dataset = datasets.load_dataset( + local_dataset_path, + ) + else: + dataset = datasets.load_dataset( + data_source, + ) + + train_dataset = dataset["train"] + test_dataset = dataset["test"] + + instruction_following = ( + r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. " + r"The reasoning process MUST BE enclosed within tags. " + r"The final answer MUST BE put in \boxed{}." 
+ ) + + # add a row to each data item that represents a unique id + def make_map_fn(split): + def process_fn(example, idx): + problem = example.pop("problem") + prompt = problem + " " + instruction_following + answer = example.pop("answer") + images = example.pop("images") + + data = { + "data_source": data_source, + "prompt": [ + { + "role": "user", + "content": prompt, + } + ], + "images": images, + "ability": "math", + "reward_model": {"style": "rule", "ground_truth": answer}, + "extra_info": { + "split": split, + "index": idx, + "answer": answer, + "question": problem, + }, + } + return data + + return process_fn + + train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True, num_proc=8) + test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True, num_proc=8) + + hdfs_dir = args.hdfs_dir + local_save_dir = args.local_dir + if local_save_dir is not None: + print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.") + else: + local_save_dir = args.local_save_dir + + train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet")) + test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet")) + + if hdfs_dir is not None: + makedirs(hdfs_dir) + copy(src=local_save_dir, dst=hdfs_dir) diff --git a/code/RL_model/verl/verl_train/examples/data_preprocess/geo3k_multiturn_w_tool.py b/code/RL_model/verl/verl_train/examples/data_preprocess/geo3k_multiturn_w_tool.py new file mode 100644 index 0000000000000000000000000000000000000000..53c7197f9d2d00ccf256aee57e2de1847a926725 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/data_preprocess/geo3k_multiturn_w_tool.py @@ -0,0 +1,120 @@ +# Copyright 2023-2025 SGLang Team +# Copyright Amazon.com, Inc. or its affiliates. +# Copyright 2025 Reallm Labs Ltd. or its affiliates +# Copyright 2025 ModelBest Inc. 
and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Preprocess the Geometry3k dataset to parquet format +""" + +import argparse +import os + +import datasets + +from verl.utils.hdfs_io import copy, makedirs + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.") + parser.add_argument("--hdfs_dir", default=None) + parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.") + parser.add_argument( + "--local_save_dir", + default="~/data/geo3k_multiturn_w_tool", + help="The save directory for the preprocessed dataset.", + ) + + args = parser.parse_args() + local_dataset_path = args.local_dataset_path + + data_source = "hiyouga/geometry3k" + + if local_dataset_path is not None: + dataset = datasets.load_dataset(local_dataset_path) + else: + dataset = datasets.load_dataset(data_source) + + train_dataset = dataset["train"] + test_dataset = dataset["test"] + + instruction_following = ( + r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. " + r"The reasoning process MUST BE enclosed within tags. " + r"The final answer MUST BE put in \boxed{}." 
+ ) + + # add a row to each data item that represents a unique id + def make_map_fn(split): + def process_fn(example, idx): + problem = example.pop("problem") + prompt = problem + " " + instruction_following + answer = example.pop("answer") + images = example.pop("images") + data = { + "data_source": data_source, + "prompt": [ + { + "role": "system", + "content": ( + "You are a math expert. You are given a question and you need to solve it step by step. " + "Reasoning step by step before any tool call. " + "You should use the `calc_geo3k_reward` tool after step by step solving the question, " + "before generate final answer at least once and refine your answer if necessary. " + ), + }, + { + "role": "user", + "content": prompt, + }, + ], + "images": images, + "ability": "math", + "reward_model": {"style": "rule", "ground_truth": answer}, + "extra_info": { + "split": split, + "index": idx, + "answer": answer, + "question": problem, + "need_tools_kwargs": True, + "tools_kwargs": { + "calc_geo3k_reward": { + "create_kwargs": {"ground_truth": answer}, + # "execute_kwargs": {}, + # "calc_reward_kwargs": {}, + # "release_kwargs": {}, + }, + }, + }, + } + return data + + return process_fn + + train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True, num_proc=8) + test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True, num_proc=8) + + hdfs_dir = args.hdfs_dir + local_save_dir = args.local_dir + if local_save_dir is not None: + print("Warning: Argument 'local_dir' is deprecated. 
Please use 'local_save_dir' instead.") + else: + local_save_dir = args.local_save_dir + + train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet")) + test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet")) + if hdfs_dir is not None: + makedirs(hdfs_dir) + copy(src=local_save_dir, dst=hdfs_dir) diff --git a/code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k.py b/code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k.py new file mode 100644 index 0000000000000000000000000000000000000000..1656cdbc896a8f14fc7e09705d36335f52165533 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k.py @@ -0,0 +1,105 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Preprocess the GSM8k dataset to parquet format +""" + +import argparse +import os +import re + +import datasets + +from verl.utils.hdfs_io import copy, makedirs + + +def extract_solution(solution_str): + solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str) + assert solution is not None + final_solution = solution.group(0) + final_solution = final_solution.split("#### ")[1].replace(",", "") + return final_solution + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.") + parser.add_argument("--hdfs_dir", default=None) + parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.") + parser.add_argument( + "--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset." + ) + + args = parser.parse_args() + local_dataset_path = args.local_dataset_path + + data_source = "openai/gsm8k" + + if local_dataset_path is not None: + dataset = datasets.load_dataset(local_dataset_path, "main") + else: + dataset = datasets.load_dataset(data_source, "main") + + train_dataset = dataset["train"] + test_dataset = dataset["test"] + + instruction_following = 'Let\'s think step by step and output the final answer after "####".' 
+ + # add a row to each data item that represents a unique id + def make_map_fn(split): + def process_fn(example, idx): + question_raw = example.pop("question") + + question = question_raw + " " + instruction_following + + answer_raw = example.pop("answer") + solution = extract_solution(answer_raw) + data = { + "data_source": data_source, + "prompt": [ + { + "role": "user", + "content": question, + } + ], + "ability": "math", + "reward_model": {"style": "rule", "ground_truth": solution}, + "extra_info": { + "split": split, + "index": idx, + "answer": answer_raw, + "question": question_raw, + }, + } + return data + + return process_fn + + train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True) + test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True) + + hdfs_dir = args.hdfs_dir + local_save_dir = args.local_dir + if local_save_dir is not None: + print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.") + else: + local_save_dir = args.local_save_dir + + train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet")) + test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet")) + + if hdfs_dir is not None: + makedirs(hdfs_dir) + + copy(src=local_save_dir, dst=hdfs_dir) diff --git a/code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_sft.py b/code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_sft.py new file mode 100644 index 0000000000000000000000000000000000000000..4589362f933aa95493fdd98ce965eb810180c98a --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_sft.py @@ -0,0 +1,102 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Preprocess the GSM8k dataset to parquet format +""" + +import argparse +import os +import re + +import datasets + +from verl.utils.hdfs_io import copy, makedirs + + +def extract_solution(solution_str): + solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str) + assert solution is not None + final_solution = solution.group(0) + final_solution = final_solution.split("#### ")[1].replace(",", "") + return final_solution + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--local_dir", default=None) + parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.") + parser.add_argument( + "--local_save_dir", default="~/data/gsm8k_sft", help="The save directory for the preprocessed dataset." + ) + parser.add_argument("--hdfs_dir", default=None) + + args = parser.parse_args() + local_dataset_path = args.local_dataset_path + + data_source = "openai/gsm8k" + + if local_dataset_path is not None: + dataset = datasets.load_dataset(local_dataset_path, "main") + else: + dataset = datasets.load_dataset(data_source, "main") + + train_dataset = dataset["train"] + test_dataset = dataset["test"] + + instruction_following = 'Let\'s think step by step and output the final answer after "####".' 
+ + # add a row to each data item that represents a unique id + def make_map_fn(split): + def process_fn(example, idx): + question_raw = example.pop("question") + + question = question_raw + " " + instruction_following + + answer_raw = example.pop("answer") + data = { + "messages": [ + { + "role": "user", + "content": question, + }, + { + "role": "assistant", + "content": answer_raw, + }, + ], + } + return data + + return process_fn + + train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True) + test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True) + + hdfs_dir = args.hdfs_dir + + local_save_dir = args.local_dir + if local_save_dir is not None: + print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.") + else: + local_save_dir = args.local_save_dir + + local_save_dir = os.path.expanduser(local_save_dir) + + train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet")) + test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet")) + + if hdfs_dir is not None: + makedirs(hdfs_dir) + + copy(src=local_save_dir, dst=hdfs_dir) diff --git a/code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_w_interaction.py b/code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_w_interaction.py new file mode 100644 index 0000000000000000000000000000000000000000..c06b325c3e8076c0caff1360920f86cdd2f33bd2 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_w_interaction.py @@ -0,0 +1,119 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# Copyright 2023-2024 SGLang Team +# Copyright 2025 ModelBest Inc. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Preprocess the GSM8k dataset to parquet format +""" + +import argparse +import os +import re + +import datasets + +from verl.utils.hdfs_io import copy, makedirs + + +def extract_solution(solution_str): + solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str) + assert solution is not None + final_solution = solution.group(0) + final_solution = final_solution.split("#### ")[1].replace(",", "") + return final_solution + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.") + parser.add_argument("--hdfs_dir", default=None) + parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.") + parser.add_argument( + "--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset." + ) + + args = parser.parse_args() + local_dataset_path = args.local_dataset_path + + data_source = "openai/gsm8k" + + if local_dataset_path is not None: + dataset = datasets.load_dataset(local_dataset_path, "main") + else: + dataset = datasets.load_dataset(data_source, "main") + + train_dataset = dataset["train"] + test_dataset = dataset["test"] + + instruction_following = "Let's think step by step and output the final answer after `####`." 
+ + # add a row to each data item that represents a unique id + def make_map_fn(split): + def process_fn(example, idx): + question_raw = example.pop("question") + + question = question_raw + " " + instruction_following + + answer_raw = example.pop("answer") + solution = extract_solution(answer_raw) + data = { + "data_source": data_source, + "prompt": [ + { + "role": "system", + "content": ( + "You are a math expert. You are given a question and you need to solve it step by step. " + "You should rethinking carefully if user point out your answer is wrong. " + "Put your final answer in the format of `#### `." + ), + }, + { + "role": "user", + "content": question, + }, + ], + "ability": "math", + "reward_model": {"style": "rule", "ground_truth": solution}, + "extra_info": { + "split": split, + "index": idx, + "answer": answer_raw, + "question": question_raw, + "interaction_kwargs": { + "name": "gsm8k", + "query": question, + "ground_truth": solution, + }, + }, + } + return data + + return process_fn + + train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True) + test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True) + + hdfs_dir = args.hdfs_dir + local_save_dir = args.local_dir + if local_save_dir is not None: + print("Warning: Argument 'local_dir' is deprecated. 
Please use 'local_save_dir' instead.") + else: + local_save_dir = args.local_save_dir + + train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet")) + test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet")) + + if hdfs_dir is not None: + makedirs(hdfs_dir) + copy(src=local_save_dir, dst=hdfs_dir) diff --git a/code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_w_tool.py b/code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_w_tool.py new file mode 100644 index 0000000000000000000000000000000000000000..083550ad7f160a5caac97d85ee33164b0437119d --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_w_tool.py @@ -0,0 +1,129 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# Copyright 2023-2024 SGLang Team +# Copyright 2025 ModelBest Inc. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Preprocess the GSM8k dataset to parquet format +""" + +import argparse +import os +import re + +import datasets + +from verl.utils.hdfs_io import copy, makedirs + + +def extract_solution(solution_str): + solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str) + assert solution is not None + final_solution = solution.group(0) + final_solution = final_solution.split("#### ")[1].replace(",", "") + return final_solution + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.") + parser.add_argument("--hdfs_dir", default=None) + parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.") + parser.add_argument( + "--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset." + ) + + args = parser.parse_args() + local_dataset_path = args.local_dataset_path + + data_source = "openai/gsm8k" + + if local_dataset_path is not None: + dataset = datasets.load_dataset(local_dataset_path, "main") + else: + dataset = datasets.load_dataset(data_source, "main") + + train_dataset = dataset["train"] + test_dataset = dataset["test"] + + instruction_following = "Let's think step by step and output the final answer after `####`." + + # add a row to each data item that represents a unique id + def make_map_fn(split): + def process_fn(example, idx): + question_raw = example.pop("question") + + question = question_raw + " " + instruction_following + + answer_raw = example.pop("answer") + solution = extract_solution(answer_raw) + data = { + "data_source": data_source, + "prompt": [ + { + "role": "system", + "content": ( + "You are a math expert. You are given a question and you need to solve it step by step. " + "Reasoning step by step before any tool call. 
" + "You should use the `calc_gsm8k_reward` tool after step by step solving the question, " + "before generate final answer at least once and refine your answer if necessary. " + "Put your final answer in the format of `#### `." + ), + }, + { + "role": "user", + "content": question, + }, + ], + "ability": "math", + "reward_model": {"style": "rule", "ground_truth": solution}, + "extra_info": { + "split": split, + "index": idx, + "answer": answer_raw, + "question": question_raw, + "need_tools_kwargs": True, + "tools_kwargs": { + "calc_gsm8k_reward": { + "create_kwargs": {"ground_truth": solution}, + # "execute_kwargs": {}, + # "calc_reward_kwargs": {}, + # "release_kwargs": {}, + }, + }, + "interaction_kwargs": { + "query": question, + "ground_truth": solution, + }, + }, + } + return data + + return process_fn + + train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True) + test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True) + + hdfs_dir = args.hdfs_dir + local_save_dir = args.local_dir + if local_save_dir is not None: + print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.") + else: + local_save_dir = args.local_save_dir + + train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet")) + test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet")) + + if hdfs_dir is not None: + makedirs(hdfs_dir) + copy(src=local_save_dir, dst=hdfs_dir) diff --git a/code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_tool_agent_loop.py b/code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_tool_agent_loop.py new file mode 100644 index 0000000000000000000000000000000000000000..743d7c5f154b2fa4c5fcc0103a9311578b8298b9 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_tool_agent_loop.py @@ -0,0 +1,130 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# Copyright 2023-2024 SGLang Team +# Copyright 2025 ModelBest Inc. 
and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Preprocess the GSM8k dataset to parquet format +""" + +import argparse +import os +import re + +import datasets + +from verl.utils.hdfs_io import copy, makedirs + + +def extract_solution(solution_str): + solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str) + assert solution is not None + final_solution = solution.group(0) + final_solution = final_solution.split("#### ")[1].replace(",", "") + return final_solution + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.") + parser.add_argument("--hdfs_dir", default=None) + parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.") + parser.add_argument( + "--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset." + ) + + args = parser.parse_args() + local_dataset_path = args.local_dataset_path + + data_source = "openai/gsm8k" + + if local_dataset_path is not None: + dataset = datasets.load_dataset(local_dataset_path, "main") + else: + dataset = datasets.load_dataset(data_source, "main") + + train_dataset = dataset["train"] + test_dataset = dataset["test"] + + instruction_following = "Let's think step by step and output the final answer after `####`." 
+ + # add a row to each data item that represents a unique id + def make_map_fn(split): + def process_fn(example, idx): + question_raw = example.pop("question") + + question = question_raw + " " + instruction_following + + answer_raw = example.pop("answer") + solution = extract_solution(answer_raw) + data = { + "data_source": data_source, + "agent_name": "tool_agent", + "prompt": [ + { + "role": "system", + "content": ( + "You are a math expert. You are given a question and you need to solve it step by step. " + "Reasoning step by step before any tool call. " + "You should use the `calc_gsm8k_reward` tool after step by step solving the question, " + "before generate final answer at least once and refine your answer if necessary. " + "Put your final answer in the format of `#### `." + ), + }, + { + "role": "user", + "content": question, + }, + ], + "ability": "math", + "reward_model": {"style": "rule", "ground_truth": solution}, + "extra_info": { + "split": split, + "index": idx, + "answer": answer_raw, + "question": question_raw, + "need_tools_kwargs": True, + "tools_kwargs": { + "calc_gsm8k_reward": { + "create_kwargs": {"ground_truth": solution}, + # "execute_kwargs": {}, + # "calc_reward_kwargs": {}, + # "release_kwargs": {}, + }, + }, + "interaction_kwargs": { + "query": question, + "ground_truth": solution, + }, + }, + } + return data + + return process_fn + + train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True) + test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True) + + hdfs_dir = args.hdfs_dir + local_save_dir = args.local_dir + if local_save_dir is not None: + print("Warning: Argument 'local_dir' is deprecated. 
Please use 'local_save_dir' instead.") + else: + local_save_dir = args.local_save_dir + + train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet")) + test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet")) + + if hdfs_dir is not None: + makedirs(hdfs_dir) + copy(src=local_save_dir, dst=hdfs_dir) diff --git a/code/RL_model/verl/verl_train/examples/data_preprocess/hellaswag.py b/code/RL_model/verl/verl_train/examples/data_preprocess/hellaswag.py new file mode 100644 index 0000000000000000000000000000000000000000..dc73a810a80570d406bb727099f5524037be2370 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/data_preprocess/hellaswag.py @@ -0,0 +1,108 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Preprocess Hellaswag dataset. + +""" + +import argparse +import os +import re + +import datasets + +from verl.utils.hdfs_io import copy, makedirs + + +def preprocess(text): + text = text.strip() + # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag. + text = text.replace(" [title]", ". 
") + text = re.sub("\\[.*?\\]", "", text) + text = text.replace(" ", " ") + return text + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.") + parser.add_argument("--hdfs_dir", default=None) + parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.") + parser.add_argument( + "--local_save_dir", default="~/data/hellaswag", help="The save directory for the preprocessed dataset." + ) + + args = parser.parse_args() + local_dataset_path = args.local_dataset_path + + data_source = "Rowan/hellaswag" + + if local_dataset_path is not None: + dataset = datasets.load_dataset(local_dataset_path) + else: + dataset = datasets.load_dataset(data_source, trust_remote_code=True) + + train_dataset = dataset["train"] + val_dataset = dataset["validation"] + test_dataset = dataset["test"] + + instruction = "Please complete the following sentence.\n" + + def make_map_fn(split): + def process_fn(doc, idx): + ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize() + query = preprocess(doc["activity_label"] + ": " + ctx) + choices = [preprocess(ending) for ending in doc["endings"]] + gold = int(doc["label"]) + + data = { + "data_source": data_source, + "prompt": [{"role": "user", "content": query}], + "ability": "nlp", + "reward_model": { + "style": "model", + "eval": "multiple_choice", # using loglikelihood + "ground_truth": gold, + "choices": choices, + }, + "extra_info": {"split": split, "index": idx}, + } + return data + + return process_fn + + # filter data that doesn't have a label + train_dataset = train_dataset.filter(lambda x: len(x["label"]) > 0) + val_dataset = val_dataset.filter(lambda x: len(x["label"]) > 0) + test_dataset = test_dataset.filter(lambda x: len(x["label"]) > 0) + + train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True) + val_dataset = 
val_dataset.map(function=make_map_fn("validation"), with_indices=True) + test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True) + + hdfs_dir = args.hdfs_dir + local_save_dir = args.local_dir + if local_save_dir is not None: + print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.") + else: + local_save_dir = args.local_save_dir + + train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet")) + val_dataset.to_parquet(os.path.join(local_save_dir, "validation.parquet")) + test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet")) + + if hdfs_dir is not None: + makedirs(hdfs_dir) + + copy(src=local_save_dir, dst=hdfs_dir) diff --git a/code/RL_model/verl/verl_train/examples/data_preprocess/math_dataset.py b/code/RL_model/verl/verl_train/examples/data_preprocess/math_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b23a032fb1207a47dcd1bc77194a7c1a124aad55 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/data_preprocess/math_dataset.py @@ -0,0 +1,106 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Preprocess the MATH-lighteval dataset to parquet format +""" + +import argparse +import json +import os + +import datasets + +from verl.utils.hdfs_io import copy, makedirs +from verl.utils.reward_score.math_reward import last_boxed_only_string, remove_boxed + + +def extract_solution(solution_str): + return remove_boxed(last_boxed_only_string(solution_str)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--local_dir", default=None) + parser.add_argument("--hdfs_dir", default=None) + parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.") + parser.add_argument( + "--local_save_dir", default="~/data/math", help="The save directory for the preprocessed dataset." + ) + + args = parser.parse_args() + local_dataset_path = args.local_dataset_path + + # 'lighteval/MATH' is no longer available on huggingface. + # Use mirror repo: DigitalLearningGmbH/MATH-lighteval + data_source = "DigitalLearningGmbH/MATH-lighteval" + print(f"Loading the {data_source} dataset from huggingface...", flush=True) + if local_dataset_path is not None: + dataset = datasets.load_dataset( + local_dataset_path, + ) + else: + dataset = datasets.load_dataset( + data_source, + ) + + train_dataset = dataset["train"] + test_dataset = dataset["test"] + + instruction_following = "Let's think step by step and output the final answer within \\boxed{}." 
+ + # add a row to each data item that represents a unique id + def make_map_fn(split): + def process_fn(example, idx): + question = example.pop("problem") + + question = question + " " + instruction_following + + answer = example.pop("solution") + solution = extract_solution(answer) + data = { + "data_source": data_source, + "prompt": [{"role": "user", "content": question}], + "ability": "math", + "reward_model": {"style": "rule", "ground_truth": solution}, + "extra_info": {"split": split, "index": idx}, + } + return data + + return process_fn + + train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True) + test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True) + + local_save_dir = args.local_dir + if local_save_dir is not None: + print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.") + else: + local_save_dir = args.local_save_dir + + local_dir = os.path.expanduser(local_save_dir) + hdfs_dir = args.hdfs_dir + + train_dataset.to_parquet(os.path.join(local_dir, "train.parquet")) + test_dataset.to_parquet(os.path.join(local_dir, "test.parquet")) + # Save one example as JSON for reference + example = train_dataset[0] + with open(os.path.join(local_dir, "train_example.json"), "w") as f: + json.dump(example, f, indent=2) + example = test_dataset[0] + with open(os.path.join(local_dir, "test_example.json"), "w") as f: + json.dump(example, f, indent=2) + if hdfs_dir is not None: + makedirs(hdfs_dir) + + copy(src=local_dir, dst=hdfs_dir) diff --git a/code/RL_model/verl/verl_train/examples/data_preprocess/multiturn.py b/code/RL_model/verl/verl_train/examples/data_preprocess/multiturn.py new file mode 100644 index 0000000000000000000000000000000000000000..4bf0192b817d3acfc41f7e4a74a4a891d3ae61d6 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/data_preprocess/multiturn.py @@ -0,0 +1,125 @@ +# Copyright 2024 Bytedance Ltd. 
and/or its affiliates + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Create a simple multi-turn dataset for testing +""" + +import argparse +import os + +import pandas as pd + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--local_dir", default="~/data/multiturn") + parser.add_argument("--hdfs_dir", default=None) + args = parser.parse_args() + + # Create example conversations + conversations = [] + + # Conversation 1 + conversations.append( + { + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + {"role": "assistant", "content": "The capital of France is Paris."}, + {"role": "user", "content": "And what about Germany?"}, + {"role": "assistant", "content": "The capital of Germany is Berlin."}, + ] + } + ) + + # Conversation 2 + conversations.append( + { + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Can you explain quantum computing?"}, + { + "role": "assistant", + "content": "Quantum computing is a type of computing that uses quantum-mechanical phenomena, " + "such as superposition and entanglement, to perform operations on data.", + }, + {"role": "user", "content": "How is it different from classical computing?"}, + { + "role": "assistant", + "content": "Classical computing uses bits that are either 0 or 1, while quantum computing uses " + "quantum bits or qubits that can exist 
in multiple states simultaneously due to superposition.", + }, + ] + } + ) + + # Conversation 3 + conversations.append( + { + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Write a simple Python function to calculate factorial."}, + { + "role": "assistant", + "content": ( + "```python\ndef factorial(n):\n if n == 0 or n == 1:\n return 1\n else:\n " + "return n * factorial(n-1)\n```\n\nThis is a recursive function to calculate the " + "factorial of a number." + ), + }, + {"role": "user", "content": "Can you make it iterative instead?"}, + { + "role": "assistant", + "content": ( + "```python\ndef factorial(n):\n result = 1\n for i in range(1, n+1):\n " + "result *= i\n return result\n```\n\nThis is an iterative version of the factorial function." + ), + }, + ] + } + ) + + # Create train and test datasets + train_data = conversations[:2] # First 2 conversations for training + test_data = conversations[2:] # Last conversation for testing + + # Create output directory + local_dir = os.path.expanduser(args.local_dir) + os.makedirs(local_dir, exist_ok=True) + + # Save to parquet files + train_df = pd.DataFrame(train_data) + test_df = pd.DataFrame(test_data) + + train_df.to_parquet(os.path.join(local_dir, "train.parquet")) + test_df.to_parquet(os.path.join(local_dir, "test.parquet")) + + # Handle HDFS if specified + if args.hdfs_dir is not None: + try: + from verl.utils.hdfs_io import copy, makedirs + + makedirs(args.hdfs_dir) + copy(src=local_dir, dst=args.hdfs_dir) + except ImportError: + print("Warning: HDFS support not available. 
Skipping HDFS copy.") + + # Print statistics + print(f"Train dataset size: {len(train_df)}") + print(f"Test dataset size: {len(test_df)}") + print(f"Data saved to {local_dir}") + + +if __name__ == "__main__": + main() diff --git a/code/RL_model/verl/verl_train/examples/data_preprocess/pokemon.py b/code/RL_model/verl/verl_train/examples/data_preprocess/pokemon.py new file mode 100644 index 0000000000000000000000000000000000000000..3bbf4d4b46ee98669eaa40a9a9084f918791f50d --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/data_preprocess/pokemon.py @@ -0,0 +1,75 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +""" +Preprocess the llamafactory/pokemon-gpt4o-captions dataset to parquet format +""" + +import argparse +import os + +import datasets + +from verl.utils.hdfs_io import copy, makedirs + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--local_dir", default=None) + parser.add_argument("--hdfs_dir", default=None) + parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.") + parser.add_argument( + "--local_save_dir", + default="~/data/pokemon-gpt4o-captions", + help="The save directory for the preprocessed dataset.", + ) + + args = parser.parse_args() + local_dataset_path = args.local_dataset_path + + data_source = "llamafactory/pokemon-gpt4o-captions" + + if local_dataset_path is not None: + dataset = datasets.load_dataset( + local_dataset_path, + ) + else: + dataset = datasets.load_dataset( + data_source, + ) + + def map_fn(row: dict): + messages = [] + conversation = row.pop("conversations") + for conv in conversation: + if conv["from"] == "gpt": + role = "assistant" + elif conv["from"] == "human": + role = "user" + else: + raise ValueError(f"Unknown role: {conv['from']}") + messages.append( + { + "role": role, + "content": conv["value"], + } + ) + + row["messages"] = messages + return row + + dataset = dataset["train"].map(map_fn, num_proc=16) + 
dataset = dataset.train_test_split(test_size=0.1) + train_dataset = dataset["train"] + test_dataset = dataset["test"] + + hdfs_dir = args.hdfs_dir + local_save_dir = args.local_dir + if local_save_dir is not None: + print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.") + else: + local_save_dir = args.local_save_dir + + train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet")) + test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet")) + + if hdfs_dir is not None: + makedirs(hdfs_dir) + copy(src=local_save_dir, dst=hdfs_dir) diff --git a/code/RL_model/verl/verl_train/examples/data_preprocess/preprocess_search_r1_dataset.py b/code/RL_model/verl/verl_train/examples/data_preprocess/preprocess_search_r1_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..a0c10d59b9c006ae7234ce21f7bdb25562259b23 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/data_preprocess/preprocess_search_r1_dataset.py @@ -0,0 +1,178 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# Copyright 2023-2024 SGLang Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import logging +import os +import tempfile + +import pandas as pd +from huggingface_hub import hf_hub_download +from huggingface_hub.utils import EntryNotFoundError + +from verl.utils.hdfs_io import copy, makedirs + +# Setup logging +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + +# Configuration constants +DEFAULT_SYSTEM_CONTENT = "You are a helpful and harmless assistant." +DEFAULT_USER_CONTENT_PREFIX = ( + "Answer the given question. You must conduct reasoning inside and " + "first every time you get new information. After reasoning, if you find you lack " + "some knowledge, you can call a search engine by query " + "and it will return the top searched results between and " + ". You can search as many times as your want. If you find no " + "further external knowledge needed, you can directly provide the answer inside " + " and , without detailed illustrations. For example, " + " Beijing . Question: " +) + + +def process_single_row(row, current_split_name, row_index): + """ + Process a single row of data for SearchR1-like format. 
+ + Args: + row: DataFrame row containing the original data + current_split_name: Name of the current split (train/test) + row_index: Index of the row in the DataFrame + + Returns: + pd.Series: Processed row data in the required format + """ + question = row.get("question", "") + + # Build prompt structure + user_content = user_content_prefix.rstrip("\n") + question + prompt = [{"role": "system", "content": system_content}, {"role": "user", "content": user_content}] + + # Extract ground truth from reward_model or fallback to golden_answers + reward_model_data = row.get("reward_model") + if isinstance(reward_model_data, dict) and "ground_truth" in reward_model_data: + ground_truth = reward_model_data.get("ground_truth") + else: + ground_truth = row.get("golden_answers", []) + + # Process data source + data_source_tagged = "searchR1_" + str(row.get("data_source", "")) + + # Build tools kwargs structure + tools_kwargs = { + "search": { + "create_kwargs": {"ground_truth": ground_truth, "question": question, "data_source": data_source_tagged} + } + } + + # Build complete extra_info structure + extra_info = { + "index": row_index, + "need_tools_kwargs": True, + "question": question, + "split": current_split_name, + "tools_kwargs": tools_kwargs, + } + + return pd.Series( + { + "data_source": data_source_tagged, + "prompt": prompt, + "ability": row.get("ability"), + "reward_model": reward_model_data, + "extra_info": extra_info, + "metadata": row.get("metadata"), + } + ) + + +def main(): + local_save_dir = os.path.expanduser(args.local_dir) + os.makedirs(local_save_dir, exist_ok=True) + + processed_files = [] + + # Download and process files using temporary directory + with tempfile.TemporaryDirectory() as tmp_download_dir: + for split in ["train", "test"]: + parquet_filename = f"{split}.parquet" + logger.info(f"Processing {split} split...") + + try: + # Download Parquet file from HuggingFace + logger.info(f"Downloading {parquet_filename} from {args.hf_repo_id}") + 
local_parquet_filepath = hf_hub_download( + repo_id=args.hf_repo_id, + filename=parquet_filename, + repo_type="dataset", + local_dir=tmp_download_dir, + local_dir_use_symlinks=False, + ) + + # Load and process Parquet file + df_raw = pd.read_parquet(local_parquet_filepath) + logger.info(f"Loaded {len(df_raw)} rows from {parquet_filename}") + + def apply_process_row(row, split_name=split): + return process_single_row(row, current_split_name=split_name, row_index=row.name) + + df_processed = df_raw.apply(apply_process_row, axis=1) + + # Save processed DataFrame + output_file_path = os.path.join(local_save_dir, f"{split}.parquet") + df_processed.to_parquet(output_file_path, index=False) + logger.info(f"Saved {len(df_processed)} processed rows to {output_file_path}") + processed_files.append(output_file_path) + + except EntryNotFoundError: + logger.warning(f"{parquet_filename} not found in repository {args.hf_repo_id}") + except Exception as e: + logger.error(f"Error processing {split} split: {e}") + + if not processed_files: + logger.warning("No data was processed or saved") + return + + logger.info(f"Successfully processed {len(processed_files)} files to {local_save_dir}") + + # Copy to HDFS if specified + if args.hdfs_dir: + try: + makedirs(args.hdfs_dir) + copy(src=local_save_dir, dst=args.hdfs_dir) + logger.info(f"Successfully copied files to HDFS: {args.hdfs_dir}") + except Exception as e: + logger.error(f"Error copying files to HDFS: {e}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Download Search-R1 from HuggingFace, process, and save to Parquet.") + parser.add_argument( + "--hf_repo_id", default="PeterJinGo/nq_hotpotqa_train", help="HuggingFace dataset repository ID." 
+ ) + parser.add_argument( + "--local_dir", + default="~/data/searchR1_processed_direct", + help="Local directory to save the processed Parquet files.", + ) + parser.add_argument("--hdfs_dir", default=None, help="Optional HDFS directory to copy the Parquet files to.") + + args = parser.parse_args() + + # System and user content configuration + system_content = DEFAULT_SYSTEM_CONTENT + user_content_prefix = DEFAULT_USER_CONTENT_PREFIX + + main() diff --git a/code/RL_model/verl/verl_train/examples/gmpo_trainer/README.md b/code/RL_model/verl/verl_train/examples/gmpo_trainer/README.md new file mode 100644 index 0000000000000000000000000000000000000000..71d0bb212235ad7e1297822ad6c130711d19d262 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/gmpo_trainer/README.md @@ -0,0 +1,59 @@ +
+ +# Geometric-Mean Policy Optimization +
+ +This is the official implementation of paper [***Geometric-Mean Policy Optimization***](https://arxiv.org/abs/2507.20673). + +
+image +
+ +## 1. Contents +- Geometric-Mean Policy Optimization + - [1. Contents](#1-contents) + - [2. Introduction](#2-introduction) + - [3. Code Usage](#3-code-usage) + - [4. Contacts](#4-contacts) + - [5. Citation](#5-citation) + +## 2. Introduction + +Group Relative Policy Optimization (GRPO) has significantly enhanced the reasoning capability of large language models by optimizing the arithmetic mean of token-level rewards. Unfortunately, GRPO is observed to suffer from unstable policy updates when facing tokens with outlier importance-weighted rewards, which manifest as extreme importance sampling ratios during training. In this study, we propose Geometric-Mean Policy Optimization (GMPO), with the aim to improve the stability of GRPO through suppressing token reward outliers. Instead of optimizing the arithmetic mean, GMPO maximizes the geometric mean of token-level rewards, which is inherently less sensitive to outliers and maintains a more stable range of importance sampling ratio. GMPO is plug-and-play—simply replacing GRPO's arithmetic mean with the geometric mean of token-level rewards, as the latter is inherently less sensitive to outliers. GMPO is theoretically plausible—analysis reveals that both GMPO and GRPO are weighted forms of the policy gradient while the former enjoys more stable weights, which consequently benefits policy optimization and performance. Experiments on multiple mathematical reasoning benchmarks show that GMPO-7B improves the average Pass@1 of GRPO by up to 4.1%, outperforming many state-of-the-art approaches. + +## 3. Code Usage + +The key configurations are: +``` +clip_ratio_low=0.4 +clip_ratio_high=0.4 +loss_mode=geo_mean +``` +We observed that using a large clip ratio during Mixture-of-Experts (MoE) model training often leads to optimization instability. When training MoE models, consider lowering the clip ratio to achieve more stable convergence. 
+To get started quickly, run: +``` +bash examples/gmpo_trainer/run_qwen2_5-7b_math.sh +``` + +GMPO can be combined with other methods such as DAPO (experimental - not fully tested): +``` +bash examples/gmpo_trainer/test_dapo_7b_math.sh +bash examples/gmpo_trainer/test_dapo_qwen3_30b_math.sh +``` + +## 4. Contacts +If you have any question about our work or this repository, please don't hesitate to contact us by emails or open an issue under this project. +- [zhaoyuzhong20@mails.ucas.ac.cn](zhaoyuzhong20@mails.ucas.ac.cn) +- [liuyue171@mails.ucas.ac.cn](liuyue171@mails.ucas.ac.cn) +- [lecu@microsoft.com](lecu@microsoft.com) +- [wanfang@ucas.ac.cn](wanfang@ucas.ac.cn) + +## 5. Citation +``` +@article{zhao2025geometric, + title={Geometric-mean policy optimization}, + author={Zhao, Yuzhong and Liu, Yue and Liu, Junpeng and Chen, Jingye and Wu, Xun and Hao, Yaru and Lv, Tengchao and Huang, Shaohan and Cui, Lei and Ye, Qixiang and others}, + journal={arXiv preprint arXiv:2507.20673}, + year={2025} +} +``` diff --git a/code/RL_model/verl/verl_train/examples/gmpo_trainer/run_qwen2_5-7b_math.sh b/code/RL_model/verl/verl_train/examples/gmpo_trainer/run_qwen2_5-7b_math.sh new file mode 100644 index 0000000000000000000000000000000000000000..06ad91c9fa47cf685425335a37af7eeb3eab15b6 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/gmpo_trainer/run_qwen2_5-7b_math.sh @@ -0,0 +1,60 @@ +set -x + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +use_kl_loss=False +loss_mode=geo_mean +clip_ratio=0.4 +save_contents="['model', 'optimizer', 'extra']" + +export WANDB_MODE=offline +save_contents="['hf_model']" + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + data.train_files="$train_files" \ + 
data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-Math-7B \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.policy_loss.loss_mode=${loss_mode} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.actor.checkpoint.save_contents=${save_contents} \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_gmpo_example_gsm8k_math' \ + trainer.experiment_name='qwen2_5_7b_function_rm' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/gmpo_trainer/test_dapo_7b_math.sh b/code/RL_model/verl/verl_train/examples/gmpo_trainer/test_dapo_7b_math.sh new file mode 100644 index 
0000000000000000000000000000000000000000..a355c859b80d05754836fa87314289986ebfef67 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/gmpo_trainer/test_dapo_7b_math.sh @@ -0,0 +1,138 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='DAPO-Qwen2.5-7b-MATH-0527a1' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.4 +clip_ratio_high=0.4 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +NNODES=${NNODES:-8} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +sp_size=4 +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +offload=True +gen_tp=4 +fsdp_size=32 + +loss_mode=geo_mean + +# export WANDB_MODE=offline +save_contents="['model', 'optimizer', 'extra']" +# save_contents="['hf_model']" + +# reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361 + +python3 -m verl.trainer.main_ppo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.policy_loss.loss_mode=${loss_mode} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.model.use_remove_padding=True \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + 
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + 
actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.actor.checkpoint.save_contents="${save_contents}" \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger='["console","wandb"]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=10 \ + trainer.save_freq=10 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=200 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 diff --git a/code/RL_model/verl/verl_train/examples/gmpo_trainer/test_dapo_qwen3_30b_math.sh b/code/RL_model/verl/verl_train/examples/gmpo_trainer/test_dapo_qwen3_30b_math.sh new file mode 100644 index 0000000000000000000000000000000000000000..c63805a3baa17b4d75e0c675f9a4f0be24cd1976 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/gmpo_trainer/test_dapo_qwen3_30b_math.sh @@ -0,0 +1,134 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO' +exp_name='DAPO-Qwen3-30B-A3B-Base-MATH-0527a1' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.4 +clip_ratio_high=0.4 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=512 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 + +loss_mode=geo_mean + +# export WANDB_MODE=offline 
+save_contents="['model', 'optimizer', 'extra']" +# save_contents="['hf_model']" + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +NNODES=${NNODES:-8} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-30B-A3B-Base"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +sp_size=4 +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +offload=True +gen_tp=4 +fsdp_size=32 + +python3 -m verl.trainer.main_ppo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.policy_loss.loss_mode=${loss_mode} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + 
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + 
actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.actor.checkpoint.save_contents="${save_contents}" \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger='["console","wandb"]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=True \ + trainer.test_freq=10 \ + trainer.save_freq=10 \ + trainer.total_epochs=10 \ + trainer.total_training_steps=300 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 diff --git a/code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm_math_megatron.sh b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm_math_megatron.sh new file mode 100644 index 0000000000000000000000000000000000000000..46788e16f5bcf31c53ac3ee489743ee4bd985a8d --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm_math_megatron.sh @@ -0,0 +1,50 @@ +set -x + +export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo --config-path=config \ + --config-name='ppo_megatron_trainer.yaml'\ + algorithm.adv_estimator=grpo \ + 
data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_grpo_example_gsm8k_math' \ + trainer.experiment_name='deepseek_llm_7b_math_megatron' \ + trainer.n_gpus_per_node=16 \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/grpo_trainer/run_glm41v_9b.sh b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_glm41v_9b.sh new file mode 100644 index 0000000000000000000000000000000000000000..a845bcc244f79ae7301a04c0e010a2586d528166 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_glm41v_9b.sh @@ -0,0 +1,46 @@ +set -x +ENGINE=${1:-vllm} + +python3 -m 
verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + data.train_files=$HOME/data/geo3k/train.parquet \ + data.val_files=$HOME/data/geo3k/test.parquet \ + data.train_batch_size=512 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.image_key=images \ + actor_rollout_ref.model.path=zai-org/GLM-4.1V-9B-Thinking \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=128 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=10 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.01 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=$ENGINE \ + +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.rollout.enable_chunked_prefill=False \ + actor_rollout_ref.rollout.enforce_eager=False \ + actor_rollout_ref.rollout.free_cache_engine=True \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_grpo_example_geo3k' \ + trainer.experiment_name='glm41v_9b_function_rm' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ diff --git 
a/code/RL_model/verl/verl_train/examples/grpo_trainer/run_minicpmo2_6.sh b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_minicpmo2_6.sh new file mode 100644 index 0000000000000000000000000000000000000000..d1daab99a9fb16e6698ad8a7a22d7ea64e091281 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_minicpmo2_6.sh @@ -0,0 +1,49 @@ +set -x + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + data.train_files=$HOME/data/geo3k/train.parquet \ + data.val_files=$HOME/data/geo3k/test.parquet \ + data.train_batch_size=128 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=False \ + data.truncation='error' \ + data.image_key=images \ + data.trust_remote_code=True \ + data.custom_cls.path=recipe/minicpmo/rl_dataset.py \ + data.custom_cls.name=RLHFDataset \ + actor_rollout_ref.model.path=openbmb/MiniCPM-o-2_6 \ + actor_rollout_ref.model.trust_remote_code=True \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=32 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.use_dynamic_bsz=False \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.actor.fsdp_config.use_orig_params=True \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.rollout.enable_chunked_prefill=False \ + actor_rollout_ref.rollout.enforce_eager=False \ + actor_rollout_ref.rollout.free_cache_engine=False \ + 
actor_rollout_ref.rollout.n=8 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_grpo_example_geo3k' \ + trainer.experiment_name='minicpmo2_6_function_rm' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_sgl_megatron.sh b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_sgl_megatron.sh new file mode 100644 index 0000000000000000000000000000000000000000..5dc4ec87fa75512d24f76e2875b60efc3ffb9090 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_sgl_megatron.sh @@ -0,0 +1,47 @@ +set -x + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo --config-path=config \ + --config-name='ppo_megatron_trainer.yaml'\ + algorithm.adv_estimator=grpo \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \ + actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4 \ + 
actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=sglang \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_grpo_example_gsm8k' \ + trainer.experiment_name='qwen2_7b_function_rm_megatron' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-32b_grpo_megatron_vllm_npu.sh b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-32b_grpo_megatron_vllm_npu.sh new file mode 100644 index 0000000000000000000000000000000000000000..42abb8597b2189f005059fc8f9243b2e07a37425 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-32b_grpo_megatron_vllm_npu.sh @@ -0,0 +1,185 @@ +#!/bin/bash +set -xeuo pipefail +mkdir -p logs + +# Project Configuration +project_name='GRPO-Qwen2.5-32B-BASE-MATH' +exp_name='GRPO-Qwen2.5-32B-BASE-Megatron-vLLM' + +# Node Info +NNODES=${NNODES:-1} +NPUS_PER_NODE=${NPUS_PER_NODE:-16} + +# Model Weights Paths +MODEL_PATH=Qwen/Qwen2.5-32B +MCORE_MODEL_PATH=Qwen/Qwen2.5-32B-dist +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} + +# File System Paths +TRAIN_FILE=$RAY_DATA_HOME/dataset/gsm8k/train.parquet +TEST_FILE=$RAY_DATA_HOME/dataset/gsm8k/test.parquet + +# Data Configuration +max_prompt_length=$((1024 * 1)) +max_response_length=$((1024 
* 1)) + +# Training Batch Configuration +train_prompt_bsz=128 +train_prompt_mini_bsz=32 +n_resp_per_prompt=16 + +# Algorithm Configuration +adv_estimator=grpo +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=True +kl_loss_coef=0.001 + +# Performance and Memory Management Configuration +all_offload=True +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 8)) +optimizer_offload_fraction=1 + +# Megatron Configuration +train_tp=4 +train_ep=1 +train_etp=1 +train_pp=4 +train_cp=1 + +# vLLM Configuration +gen_tp=2 +gen_dp=1 +gen_ep=1 +gpu_memory_utilization=0.8 +max_model_len=$((max_prompt_length + max_response_length)) +max_num_batched_tokens=$(((max_prompt_length + max_response_length) * 1)) + +# Data Configuration +DATA_CONFIG=( + data.train_files="${TRAIN_FILE}" + data.val_files="${TEST_FILE}" + data.prompt_key=prompt + data.train_batch_size=${train_prompt_bsz} + data.max_prompt_length=${max_prompt_length} + data.max_response_length=${max_response_length} + data.filter_overlong_prompts=False + data.truncation='left' +) + +# Model Configuration +MODEL_CONFIG=( + actor_rollout_ref.model.path="${MODEL_PATH}" + actor_rollout_ref.model.use_remove_padding=True +) + +# Algorithm Configuration +ALGORITHM_CONFIG=( + algorithm.adv_estimator=${adv_estimator} + algorithm.use_kl_in_reward=${use_kl_in_reward} + algorithm.kl_ctrl.kl_coef=${kl_coef} +) + +# Actor Model Configuration +ACTOR_CONFIG=( + actor_rollout_ref.actor.use_torch_compile=False + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} + actor_rollout_ref.actor.kl_loss_type=low_var_kl + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} + actor_rollout_ref.actor.entropy_coeff=0 + actor_rollout_ref.actor.ppo_epochs=1 + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 + 
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} + actor_rollout_ref.actor.kl_loss_type=low_var_kl + actor_rollout_ref.actor.optim.lr=1e-6 + +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${optimizer_offload_fraction} + +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True + +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} + actor_rollout_ref.actor.megatron.context_parallel_size=${train_cp} + actor_rollout_ref.actor.megatron.expert_model_parallel_size=${train_ep} + actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${train_etp} + actor_rollout_ref.actor.megatron.param_offload=${all_offload} + actor_rollout_ref.actor.megatron.optimizer_offload=${all_offload} + actor_rollout_ref.actor.megatron.grad_offload=${all_offload} + actor_rollout_ref.actor.megatron.dist_checkpointing_path=${MCORE_MODEL_PATH} + actor_rollout_ref.actor.megatron.use_dist_checkpointing=False + +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True +) + +# Reference Model Configuration +REF_CONFIG=( + actor_rollout_ref.ref.use_torch_compile=False + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} + actor_rollout_ref.ref.megatron.context_parallel_size=${train_cp} + actor_rollout_ref.ref.megatron.expert_model_parallel_size=${train_ep} + actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${train_etp} + 
actor_rollout_ref.ref.megatron.param_offload=${all_offload} + actor_rollout_ref.ref.megatron.dist_checkpointing_path=${MCORE_MODEL_PATH} + actor_rollout_ref.ref.megatron.use_dist_checkpointing=False +) + +# Rollout Configuration +ROLLOUT_CONFIG=( + actor_rollout_ref.rollout.name=vllm + actor_rollout_ref.rollout.n=${n_resp_per_prompt} + actor_rollout_ref.rollout.top_p=1.0 + actor_rollout_ref.rollout.top_k=-1 + actor_rollout_ref.rollout.temperature=1.0 + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} + actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization} + actor_rollout_ref.rollout.max_num_batched_tokens=${max_num_batched_tokens} + actor_rollout_ref.rollout.max_model_len=${max_model_len} + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} + actor_rollout_ref.rollout.data_parallel_size=${gen_dp} + actor_rollout_ref.rollout.expert_parallel_size=${gen_ep} + actor_rollout_ref.rollout.enable_chunked_prefill=True + actor_rollout_ref.rollout.enable_prefix_caching=True + actor_rollout_ref.rollout.enforce_eager=True + actor_rollout_ref.rollout.free_cache_engine=True + actor_rollout_ref.rollout.val_kwargs.n=1 + actor_rollout_ref.rollout.val_kwargs.do_sample=True + actor_rollout_ref.rollout.val_kwargs.top_p=1.0 + actor_rollout_ref.rollout.val_kwargs.top_k=-1 + actor_rollout_ref.rollout.val_kwargs.temperature=1.0 +) + +# Trainer Configuration +TRAINER_CONFIG=( + trainer.logger='["console","tensorboard"]' + trainer.project_name="${project_name}" + trainer.experiment_name="${exp_name}" + trainer.nnodes="${NNODES}" + trainer.n_gpus_per_node="${NPUS_PER_NODE}" + trainer.device='npu' + trainer.total_epochs=15 + trainer.val_before_train=False + trainer.test_freq=-1 + trainer.save_freq=-1 + trainer.default_local_dir="${CKPTS_DIR}" +) + +# Main GRPO Training Command +python3 -m 
verl.trainer.main_ppo \ + --config-path=config \ + --config-name='ppo_megatron_trainer.yaml' \ + "${DATA_CONFIG[@]}" \ + "${MODEL_CONFIG[@]}" \ + "${ACTOR_CONFIG[@]}" \ + "${REF_CONFIG[@]}" \ + "${ROLLOUT_CONFIG[@]}" \ + "${ALGORITHM_CONFIG[@]}" \ + "${TRAINER_CONFIG[@]}" \ + "$@" | tee logs/run_qwen2_5-32b_grpo_megatron_vllm_npu.log diff --git a/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora_from_adapter.sh b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora_from_adapter.sh new file mode 100644 index 0000000000000000000000000000000000000000..6496974d50889bd8b82d91a098847dfe0f127463 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora_from_adapter.sh @@ -0,0 +1,47 @@ +set -x + +lora_adapter_path=${lora_adapter_path:-/path/saved/lora_adapter} + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + data.train_batch_size=1024 \ + data.max_prompt_length=512 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.shuffle=False \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \ + actor_rollout_ref.model.use_shm=True \ + actor_rollout_ref.model.lora_adapter_path=${lora_adapter_path} \ + actor_rollout_ref.actor.optim.lr=3e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=40 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + 
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+    actor_rollout_ref.rollout.name=vllm \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+    actor_rollout_ref.rollout.n=5 \
+    actor_rollout_ref.rollout.load_format=safetensors \
+    actor_rollout_ref.rollout.layered_summon=True \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=True \
+    algorithm.use_kl_in_reward=False \
+    trainer.critic_warmup=0 \
+    trainer.logger='["console","wandb"]' \
+    trainer.project_name='verl_grpo_example_gsm8k' \
+    trainer.experiment_name='qwen2.5_3b_grpo_lora' \
+    trainer.n_gpus_per_node=8 \
+    trainer.nnodes=1 \
+    trainer.save_freq=20 \
+    trainer.test_freq=5 \
+    trainer.total_epochs=15 $@
diff --git a/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..963e75a6343805afc1296f128719972271c3a39b
--- /dev/null
+++ b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh
@@ -0,0 +1,68 @@
+set -x
+
+# profiling configuration
+PROFILE_STEPS="[2,4]"
+PROFILE_RANKS_ALL=True
+DISCRETE=False
+
+# profiling NPU options
+SAVE_PATH="$HOME/profile_data"
+LEVEL="level0"
+# NOTE(review): must be quoted — unquoted, bash quote-removal turns the value
+# into [npu,cpu] (a glob bracket expression at the unquoted use site) instead
+# of the intended Hydra list literal ['npu','cpu'].
+CONTENTS="['npu','cpu']"
+ANALYSIS=True
+
+python3 -m verl.trainer.main_ppo \
+    algorithm.adv_estimator=grpo \
+    data.train_files=$HOME/data/gsm8k/train.parquet \
+    data.val_files=$HOME/data/gsm8k/test.parquet \
+    data.train_batch_size=32 \
+    data.max_prompt_length=1024 \
+    data.max_response_length=1024 \
+    data.filter_overlong_prompts=True \
+    data.truncation='error' \
+    actor_rollout_ref.model.path=Qwen/Qwen2.5-7B-Instruct \
+    actor_rollout_ref.actor.optim.lr=5e-8 \
+    actor_rollout_ref.model.use_remove_padding=False \
actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=2 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.profiler.enable=True \ + actor_rollout_ref.actor.profiler.all_ranks=$PROFILE_RANKS_ALL \ + actor_rollout_ref.actor.profiler.tool_config.npu.discrete=$DISCRETE \ + actor_rollout_ref.actor.profiler.tool_config.npu.contents=$CONTENTS \ + actor_rollout_ref.actor.profiler.tool_config.npu.level=$LEVEL \ + actor_rollout_ref.actor.profiler.tool_config.npu.analysis=$ANALYSIS \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \ + actor_rollout_ref.rollout.n=4 \ + actor_rollout_ref.rollout.enable_chunked_prefill=False \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.ref.profiler.enable=True \ + actor_rollout_ref.ref.profiler.all_ranks=$PROFILE_RANKS_ALL \ + actor_rollout_ref.ref.profiler.tool_config.npu.discrete=$DISCRETE \ + actor_rollout_ref.ref.profiler.tool_config.npu.contents=$CONTENTS \ + actor_rollout_ref.ref.profiler.tool_config.npu.level=$LEVEL \ + actor_rollout_ref.ref.profiler.tool_config.npu.analysis=$ANALYSIS \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger=console \ + trainer.project_name='verl_grpo_example_gsm8k' \ + trainer.experiment_name='qwen2_5_7b_function_rm' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=5 \ + 
trainer.total_epochs=5 \ + global_profiler.tool=npu \ + global_profiler.steps=$PROFILE_STEPS \ + global_profiler.save_path=$SAVE_PATH + $@ \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b.sh b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b.sh new file mode 100644 index 0000000000000000000000000000000000000000..b64ec094118bfece1ee081326f82bd0813b835c6 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b.sh @@ -0,0 +1,47 @@ +set -x +ENGINE=${1:-vllm} + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + data.train_files=$HOME/data/geo3k/train.parquet \ + data.val_files=$HOME/data/geo3k/test.parquet \ + data.train_batch_size=512 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.image_key=images \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-7B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.use_fused_kernels=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=128 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=10 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.01 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=$ENGINE \ + +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.rollout.enable_chunked_prefill=False \ + 
actor_rollout_ref.rollout.enforce_eager=False \ + actor_rollout_ref.rollout.free_cache_engine=True \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_grpo_example_geo3k' \ + trainer.experiment_name='qwen2_5_vl_7b_function_rm' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-32b_npu.sh b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-32b_npu.sh new file mode 100644 index 0000000000000000000000000000000000000000..ea4883f951605aed31d48278e8c242e2820849ee --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-32b_npu.sh @@ -0,0 +1,58 @@ +set -x + +project_name='GRPO-Qwen3' +exp_name='GRPO-Qwen3-32b-npu' +gen_tp=4 +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-32B"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/gsm8k/train.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/gsm8k/test.parquet"} + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.train_batch_size=1024 \ + data.max_prompt_length=2048 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.shuffle=False \ + actor_rollout_ref.model.path=${MODEL_PATH} \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=4 \ + +actor_rollout_ref.actor.fsdp_config.mixed_precision.param_dtype=bf16 \ + +actor_rollout_ref.actor.fsdp_config.mixed_precision.reduce_dtype=bf16 \ + 
+actor_rollout_ref.actor.fsdp_config.mixed_precision.buffer_dtype=fp32 \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=True \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ + actor_rollout_ref.rollout.n=4 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.actor.use_torch_compile=False \ + actor_rollout_ref.ref.use_torch_compile=False \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=32768 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=4 \ + trainer.resume_from_path=checkpoints/ \ + trainer.save_freq=500 \ + trainer.test_freq=50 \ + trainer.total_epochs=50 $@ \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-8b.sh b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-8b.sh new file mode 100644 index 0000000000000000000000000000000000000000..a99b432d6abe46a7c62f69e47398ef99b10aa5c2 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-8b.sh @@ -0,0 +1,43 @@ +# Tested successfully on the 
hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.4-flashinfer0.2.2-cxx11abi0 image. +# It outperforms the Qwen2 7B base model by two percentage points on the test set of GSM8K. + +set -x + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + data.train_batch_size=1024 \ + data.max_prompt_length=512 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=Qwen/Qwen3-8B \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_grpo_example_gsm8k' \ + trainer.experiment_name='qwen3_8b_function_rm' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh 
b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh new file mode 100644 index 0000000000000000000000000000000000000000..878b106f9f17996cc2c2c1c951b7128c886cd6bc --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh @@ -0,0 +1,71 @@ +set -x +export HCCL_CONNECT_TIMEOUT=1500 +export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050 +export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050 +export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 +# WORKSPACE_HOME and DATA_HOME support custom path configuration. +WORKSPACE_HOME=$pwd +DATA_HOME=$pwd + +sp_size=4 +num_npu=4 +tp_size=4 +train_prompt_bsz=16 +train_prompt_mini_bsz=16 + +max_prompt_length=512 +max_response_length=1024 + +CKPTS_DIR=$WORKSPACE_HOME/logs/ckpt/qwen3_8b +model_path=$DATA_HOME/models/Qwen3-8B +train_data=$DATA_HOME/datasets/processed_gsm8k/train.parquet +valid_data=$DATA_HOME/datasets/processed_gsm8k/test.parquet + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + data.train_files=$train_data \ + data.val_files=$valid_data \ + data.train_batch_size=$train_prompt_bsz \ + data.max_prompt_length=$max_prompt_length \ + data.max_response_length=$max_response_length \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=$model_path \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=$train_prompt_mini_bsz \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.use_torch_compile=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=True \ + 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$tp_size \ + actor_rollout_ref.rollout.name=sglang \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \ + actor_rollout_ref.rollout.n=5 \ + +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend" \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.rollout.enable_chunked_prefill=False \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.nccl_timeout=1800 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger=console \ + trainer.val_before_train=False \ + trainer.project_name='verl_grpo_example_512_1024_gsm8k' \ + trainer.experiment_name='qwen3_8b_function_rm' \ + trainer.n_gpus_per_node=$num_npu \ + trainer.nnodes=1 \ + trainer.save_freq=1000 \ + trainer.test_freq=10000 \ + trainer.total_epochs=5 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} $@ \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_vl-8b-megatron.sh b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_vl-8b-megatron.sh new file mode 100644 index 0000000000000000000000000000000000000000..69739c2d512baa6999c2022e32049d3bb3466293 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_vl-8b-megatron.sh @@ -0,0 +1,86 @@ +set -x +ENGINE=${1:-vllm} +export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping + +# dependency: vllm>=0.11.0, megatron-lm>=0.13, mbridge with qwen3vl_cp branch +# environment option1: use a stable container later than docker://verlai/verl:vllm011.dev6 + # and install mbridge in it by 
following the instruction in the container + # pip remove mbridge if you have installed it + # pip install git+https://github.com/ISEEKYAN/mbridge.git@qwen3vl_cp # for correct mbridge +# environment option2: use container docker://verlai/verl:vllm011.dev_qwenvl_cp + + +export VLLM_ALLREDUCE_USE_SYMM_MEM=0 # for vllm0.11.0 with TP + + +HF_MODEL_PATH=${HF_MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-VL-8B-Instruct"} + +GEN_TP=${GEN_TP:-4} +CP=${CP:-2} +TP=${TP:-2} +PP=${PP:-2} + +train_path=$HOME/data/geo3k/train.parquet +test_path=$HOME/data/geo3k/test.parquet + +python3 -m verl.trainer.main_ppo --config-path=config \ + --config-name='ppo_megatron_trainer.yaml'\ + algorithm.adv_estimator=grpo \ + data.train_files="$train_path" \ + data.val_files="$test_path" \ + data.train_batch_size=512 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=$HF_MODEL_PATH \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=128 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \ + actor_rollout_ref.actor.megatron.context_parallel_size=$CP \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.01 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$GEN_TP \ + actor_rollout_ref.actor.use_dynamic_bsz=True \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=4096 \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=4096 \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=4096 
\ + actor_rollout_ref.rollout.name=$ENGINE \ + +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.actor.megatron.use_mbridge=True \ + actor_rollout_ref.actor.megatron.param_offload=True \ + actor_rollout_ref.actor.megatron.optimizer_offload=True \ + actor_rollout_ref.actor.megatron.grad_offload=True \ + actor_rollout_ref.ref.megatron.param_offload=True \ + +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1 \ + +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \ + +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \ + +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \ + +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_grpo_example_geo3k' \ + trainer.experiment_name='qwen3_vl_8b_megatron' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=5 \ + 
trainer.total_epochs=15 $@ \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_grpo_megatron_vllm_npu.sh b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_grpo_megatron_vllm_npu.sh new file mode 100644 index 0000000000000000000000000000000000000000..6c4ef91a5c702c734b628f683d567892bfd52409 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_grpo_megatron_vllm_npu.sh @@ -0,0 +1,188 @@ +#!/bin/bash +set -xeuo pipefail +mkdir -p logs + +# Project Configuration +project_name='GRPO-Qwen3-30b-A3B-BASE-MATH' +exp_name='GRPO-Qwen3-30B-A3B-BASE-Megatron-vLLM' + +# Node Info +NNODES=${NNODES:-1} +NPUS_PER_NODE=${NPUS_PER_NODE:-16} + +# Model Weights Paths +MODEL_PATH=Qwen/Qwen3-30B-A3B-Base +MCORE_MODEL_PATH=Qwen/Qwen3-30B-A3B-Base-dist +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} + +# File System Paths +TRAIN_FILE=$RAY_DATA_HOME/dataset/gsm8k/train.parquet +TEST_FILE=$RAY_DATA_HOME/dataset/gsm8k/test.parquet + +# Data Configuration +max_prompt_length=$((1024 * 1)) +max_response_length=$((1024 * 1)) + +# Training Batch Configuration +train_prompt_bsz=128 +train_prompt_mini_bsz=32 +n_resp_per_prompt=16 + +# Algorithm Configuration +adv_estimator=grpo +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=True +kl_loss_coef=0.001 + +# Performance and Memory Management Configuration +all_offload=True +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 4)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 8)) +optimizer_offload_fraction=1 + +# Megatron Configuration +train_tp=2 +train_ep=8 +train_etp=1 +train_pp=2 +train_cp=1 + +# vLLM Configuration +gen_tp=2 +gen_dp=1 +gen_ep=1 +gpu_memory_utilization=0.8 +max_model_len=$((max_prompt_length + max_response_length)) +max_num_batched_tokens=$(((max_prompt_length + 
max_response_length) * 1)) + +# Data Configuration +DATA_CONFIG=( + data.train_files="${TRAIN_FILE}" + data.val_files="${TEST_FILE}" + data.prompt_key=prompt + data.train_batch_size=${train_prompt_bsz} + data.max_prompt_length=${max_prompt_length} + data.max_response_length=${max_response_length} + data.filter_overlong_prompts=False + data.truncation='left' +) + +# Model Configuration +MODEL_CONFIG=( + actor_rollout_ref.model.path="${MODEL_PATH}" + actor_rollout_ref.model.use_remove_padding=True +) + +# Algorithm Configuration +ALGORITHM_CONFIG=( + algorithm.adv_estimator=${adv_estimator} + algorithm.use_kl_in_reward=${use_kl_in_reward} + algorithm.kl_ctrl.kl_coef=${kl_coef} +) + +# Actor Model Configuration +ACTOR_CONFIG=( + actor_rollout_ref.actor.use_torch_compile=False + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} + actor_rollout_ref.actor.kl_loss_type=low_var_kl + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} + actor_rollout_ref.actor.entropy_coeff=0 + actor_rollout_ref.actor.ppo_epochs=1 + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} + actor_rollout_ref.actor.kl_loss_type=low_var_kl + actor_rollout_ref.actor.optim.lr=1e-6 + +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${optimizer_offload_fraction} + +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True + +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} + actor_rollout_ref.actor.megatron.context_parallel_size=${train_cp} + actor_rollout_ref.actor.megatron.expert_model_parallel_size=${train_ep} + 
actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${train_etp} + actor_rollout_ref.actor.megatron.param_offload=${all_offload} + actor_rollout_ref.actor.megatron.optimizer_offload=${all_offload} + actor_rollout_ref.actor.megatron.grad_offload=${all_offload} + actor_rollout_ref.actor.megatron.dist_checkpointing_path=${MCORE_MODEL_PATH} + actor_rollout_ref.actor.megatron.use_dist_checkpointing=False + +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 +) + +# Reference Model Configuration +REF_CONFIG=( + actor_rollout_ref.ref.use_torch_compile=False + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} + actor_rollout_ref.ref.megatron.context_parallel_size=${train_cp} + actor_rollout_ref.ref.megatron.expert_model_parallel_size=${train_ep} + actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${train_etp} + actor_rollout_ref.ref.megatron.param_offload=${all_offload} + actor_rollout_ref.ref.megatron.dist_checkpointing_path=${MCORE_MODEL_PATH} + actor_rollout_ref.ref.megatron.use_dist_checkpointing=False +) + +# Rollout Configuration +ROLLOUT_CONFIG=( + actor_rollout_ref.rollout.name=vllm + actor_rollout_ref.rollout.n=${n_resp_per_prompt} + actor_rollout_ref.rollout.top_p=1.0 + actor_rollout_ref.rollout.top_k=-1 + actor_rollout_ref.rollout.temperature=1.0 + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 + 
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} + actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization} + actor_rollout_ref.rollout.max_num_batched_tokens=${max_num_batched_tokens} + actor_rollout_ref.rollout.max_model_len=${max_model_len} + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} + actor_rollout_ref.rollout.data_parallel_size=${gen_dp} + actor_rollout_ref.rollout.expert_parallel_size=${gen_ep} + actor_rollout_ref.rollout.enable_chunked_prefill=True + actor_rollout_ref.rollout.enable_prefix_caching=True + actor_rollout_ref.rollout.enforce_eager=True + actor_rollout_ref.rollout.free_cache_engine=True + actor_rollout_ref.rollout.val_kwargs.n=1 + actor_rollout_ref.rollout.val_kwargs.do_sample=True + actor_rollout_ref.rollout.val_kwargs.top_p=1.0 + actor_rollout_ref.rollout.val_kwargs.top_k=-1 + actor_rollout_ref.rollout.val_kwargs.temperature=1.0 +) + +# Trainer Configuration +TRAINER_CONFIG=( + trainer.logger='["console","tensorboard"]' + trainer.project_name="${project_name}" + trainer.experiment_name="${exp_name}" + trainer.nnodes="${NNODES}" + trainer.n_gpus_per_node="${NPUS_PER_NODE}" + trainer.device='npu' + trainer.total_epochs=15 + trainer.val_before_train=False + trainer.test_freq=-1 + trainer.save_freq=-1 + trainer.default_local_dir="${CKPTS_DIR}" +) + +# Main GRPO Training Command +python3 -m verl.trainer.main_ppo \ + --config-path=config \ + --config-name='ppo_megatron_trainer.yaml' \ + "${DATA_CONFIG[@]}" \ + "${MODEL_CONFIG[@]}" \ + "${ACTOR_CONFIG[@]}" \ + "${REF_CONFIG[@]}" \ + "${ROLLOUT_CONFIG[@]}" \ + "${ALGORITHM_CONFIG[@]}" \ + "${TRAINER_CONFIG[@]}" \ + "$@" | tee logs/run_qwen3moe-30b_grpo_megatron_vllm_npu.log diff --git a/code/RL_model/verl/verl_train/examples/mtp_trainer/runtime_env.yaml b/code/RL_model/verl/verl_train/examples/mtp_trainer/runtime_env.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..cda072e6d0a7fde3861f39a795e69a77c33d2e46 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/mtp_trainer/runtime_env.yaml @@ -0,0 +1,17 @@ +working_dir: ./ + +excludes: + - ".git/" + +env_vars: + VLLM_USE_V1: "1" + HYDRA_FULL_ERROR: "1" + NCCL_NVLS_ENABLE: "0" + NCCL_SOCKET_IFNAME: "eth0" + TMPDIR: "/tmp" + CUDA_HOME: "/usr/local/cuda" + CUDA_TMPDIR: "/tmp" + CUDA_CACHE_PATH: "/tmp/cuda_cache" + # For distributed training, the path must be set on a distributed file system (DFS) to ensure visibility across all nodes. + HF_HOME: "/tmp/hf_home_mimo" + PYTHONPATH: "/tmp/hf_home_mimo/modules/" diff --git a/code/RL_model/verl/verl_train/examples/mtp_trainer/test_dapo_mimo_7b_with_mtp_math_megatron.sh b/code/RL_model/verl/verl_train/examples/mtp_trainer/test_dapo_mimo_7b_with_mtp_math_megatron.sh new file mode 100644 index 0000000000000000000000000000000000000000..ef1d21f0158344b4776ddd5c18b2021abd99e9d3 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/mtp_trainer/test_dapo_mimo_7b_with_mtp_math_megatron.sh @@ -0,0 +1,144 @@ +#!/usr/bin/env bash + +set -xeuo pipefail + +project_name='DAPO' +exp_name='DAPO-mimo-7b-rl-megatron' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 8)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +train_prompt_bsz=128 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 + +# Ray +# RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +# WORKING_DIR=${WORKING_DIR:-"${PWD}"} +# RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/examples/mtp_trainer/runtime_env.yaml"} +NNODES=${NNODES:-16} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +# very important! 
please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/MiMo-7B-RL"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) +offload=True +gen_tp=4 +train_tp=2 +train_pp=2 +train_cp=2 + +common_params=( +actor_rollout_ref.model.mtp.enable=True +actor_rollout_ref.model.mtp.enable_train=True +actor_rollout_ref.model.mtp.mtp_loss_scaling_factor=0.1 +actor_rollout_ref.model.mtp.detach_encoder=True +) + +python -m verl.trainer.main_ppo \ + --config-path=config \ + --config-name='ppo_megatron_trainer.yaml' \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + 
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.megatron.param_offload=${offload} \ + actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ + actor_rollout_ref.actor.megatron.grad_offload=${offload} \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.actor.megatron.context_parallel_size=${train_cp} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.optim.clip_grad=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.rollout.name=sglang \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \ + actor_rollout_ref.ref.megatron.context_parallel_size=${train_cp} \ + actor_rollout_ref.ref.megatron.param_offload=${offload} \ + reward_model.reward_manager=dapo \ + 
+reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger='["console","tensorboard"]' \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=False \ + trainer.test_freq=10 \ + trainer.save_freq=-1 \ + trainer.total_epochs=10 \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 \ + actor_rollout_ref.rollout.disable_log_stats=False \ + actor_rollout_ref.rollout.prometheus.enable=True \ + actor_rollout_ref.rollout.prometheus.port=44398 \ + actor_rollout_ref.model.trust_remote_code=True \ + data.trust_remote_code=True \ + trainer.total_training_steps=400 \ + actor_rollout_ref.actor.megatron.use_mbridge=True \ + "${common_params[@]}" diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/README.md b/code/RL_model/verl/verl_train/examples/ppo_trainer/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cf037fc5cecb38393661676d3a389579f705fe29 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/README.md @@ -0,0 +1,103 @@ +# Proximal Policy Optimization (PPO) + +Proximal Policy Optimization (PPO) is a family of policy gradient methods for reinforcement learning, proposed by OpenAI in 2017. PPO strikes a balance between simplicity, stability, and performance, making it one of the most widely used algorithms in modern RL applications, including large-scale language model fine-tuning. + +Traditional policy gradient methods like REINFORCE or Vanilla Policy Gradient suffer from: + +- High variance and sample inefficiency. 
+- Instability due to large policy updates. + +PPO addresses this problem using a clipped surrogate objective that avoids overly large updates without requiring second-order derivatives. + +For more technical details regarding PPO, we suggest reading the introduction in the [OpenAI spinning up tutorial](https://spinningup.openai.com/en/latest/algorithms/ppo.html), and the paper [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347). + +## Key Components + +- Actor-Critic Architecture: PPO requires both an actor model (policy) and a critic model (value function). This differs from other algorithms like GRPO and RLOO that don't require a critic model. + +- Generalized Advantage Estimation (GAE): PPO uses GAE for computing advantage values, which helps reduce variance in policy gradient estimates while maintaining low bias. + +- Clipped Surrogate Objective: The core of PPO is implemented through the clipped surrogate objective function that limits policy updates. + +## Configuration + +Note that all configs containing `micro_batch_size` are used to configure the maximum sample or token count per forward or backward pass to avoid GPU OOMs, whose value should not change algorithmic/convergence behavior. + +Most critic configs are similar to those of actors. Note that the critic model is omitted from the figure below. + +![image](https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d) + +- `data.train_batch_size`: The global batch size of prompts used to generate a set of sampled trajectories/rollouts. The number of responses/trajectories is `data.train_batch_size * actor_rollout.ref.rollout.n` + +- `actor_rollout_ref.actor.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO actor updates. 
The ppo_mini_batch_size is a global size across all workers + +- `critic.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO critic updates. The ppo_mini_batch_size is a global size across all workers + +- `actor_rollout_ref.actor.clip_ratio`: The PPO clip range. Default to 0.2 + +- `actor_rollout_ref.actor.ppo_epochs`: Number of epochs for PPO updates on one set of sampled trajectories for actor + +- `critic.ppo_epochs`: Number of epochs for PPO updates on one set of sampled trajectories for critic. Defaults to `actor_rollout_ref.actor.ppo_epochs` + +- `algorithm.gamma`: discount factor + +- `algorithm.lam`: The lambda term that trades off between bias and variance in the GAE estimator + +- `algorithm.adv_estimator`: Support gae, grpo, reinforce_plus_plus, reinforce_plus_plus_baseline, rloo, rloo_vectorized + +## Advanced Extensions + +### KL Divergence Control + +Options to prevent the policy from diverging too far from a reference policy. Two mechanisms are available: KL reward penalty and KL loss. For more technical details, see [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155) + +Options to use KL loss for KL divergence control: + +- `actor_rollout_ref.actor.use_kl_loss`: to use kl loss in the actor. When used, we are not applying KL in the reward function. Default is False + +- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss. Default is 0.001. + +- `actor_rollout_ref.actor.kl_loss_type`: Support kl(k1), abs, mse(k2), low_var_kl(k3) and full. Appending "+" in the end (e.g., 'k1+' and 'k3+') would apply straight through to employ k2 for unbiased gradient estimation, regardless of the kl value estimation (see https://github.com/volcengine/verl/pull/2953#issuecomment-3162113848 for more details). How to calculate the kl divergence between actor and reference policy. 
See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html + +Options to use KL penalty in the reward: + +- `algorithm.use_kl_in_reward`: Whether to enable in-reward kl penalty. Default is False. + +- `algorithm.kl_penalty`: Support kl(k1), abs, mse(k2), low_var_kl(k3) and full. This defines the way to calculate the kl divergence between actor and reference policy. For specific options, refer to `kl_penalty` in core_algos.py. See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html + +- `algorithm.kl_ctrl.kl_coef`: The (initial) coefficient of in-reward kl_penalty. Default is 0.001. +- `algorithm.kl_ctrl.type`: 'fixed' for FixedKLController and 'adaptive' for AdaptiveKLController. +- `algorithm.kl_ctrl.horizon`: See source code of AdaptiveKLController for details. +- `algorithm.kl_ctrl.target_kl`: See source code of AdaptiveKLController for details. + +### Dual-clip PPO + +The Dual-Clip PPO introduces an approach that applies a lower bound to the policy ratio when the advantage is less than zero, so that the objective, even when multiplied by a large ratio, does not exceed a specified lower bound.
+ +![image](https://github.com/user-attachments/assets/fc232181-d8b0-4307-8dd2-4dc0a4c1c139) + +- `actor_rollout_ref.actor.clip_ratio_c`: lower bound of the value for Dual-clip PPO, defaults to 3.0 + +## Reference Example + +Qwen2.5 training log and commands: [link](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log) + +```bash +bash run_gemma.sh + trainer.n_gpus_per_node=1 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + trainer.logger=console \ + critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \ + data.train_batch_size=256 \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size=2 \ + critic.ppo_micro_batch_size=2 +``` + +Reference performance with verl v0.2: + +| Model | Method | Score | Link | +|-------------------------------|------------------|-------|------------------------------------------------------------------------------------------------| +| Qwen/Qwen2.5-0.5B-Instruct | pretrained model | 36.4 | [Qwen Blog](https://qwenlm.github.io/blog/qwen2.5-llm/) | +| Qwen/Qwen2.5-0.5B-Instruct | PPO | 56.7 | [PPO Command and Logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log) | diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm.sh new file mode 100644 index 0000000000000000000000000000000000000000..6a93a75b4035cd21caa8c8b123ec1397b649de62 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm.sh @@ -0,0 +1,42 @@ +set -x + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + data.train_batch_size=1024 \ + data.max_prompt_length=512 \ + 
data.max_response_length=512 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_example_gsm8k' \ + trainer.experiment_name='deepseek_llm_7b_function_rm' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=1 \ + trainer.use_legacy_worker_impl=auto \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh new file mode 100644 index 0000000000000000000000000000000000000000..eb6dc79234a14152eb8583e58096e4d4fd8f0d04 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh @@ -0,0 +1,42 @@ +set -x + +VERL_USE_MODELSCOPE=True \ +python3 -m 
verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + data.train_batch_size=1024 \ + data.max_prompt_length=512 \ + data.max_response_length=512 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_example_gsm8k' \ + trainer.experiment_name='deepseek_llm_7b_function_rm' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=1 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_pfppo.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_pfppo.sh new file mode 100644 index 0000000000000000000000000000000000000000..312c6b50b78272e1b0af06fa1b49fcf88f00639b --- /dev/null 
+++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_pfppo.sh @@ -0,0 +1,45 @@ +set -x + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + algorithm.use_pf_ppo=True \ + algorithm.pf_ppo.reweight_method=pow \ # ["pow", "max_min", "max_random"] + algorithm.pf_ppo.weight_pow=2.0 \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + data.train_batch_size=1024 \ + data.max_prompt_length=512 \ + data.max_response_length=512 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.rollout.n=5 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_example_gsm8k' \ + trainer.experiment_name='deepseek_llm_7b_function_rm' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=1 \ + trainer.total_epochs=15 
$@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh new file mode 100644 index 0000000000000000000000000000000000000000..69ee7b8bd76518dcb19aaca7d798d4a99a77e784 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh @@ -0,0 +1,44 @@ +set -x + +python3 -m verl.trainer.main_ppo \ + reward_model.sandbox_fusion.url='https://xxxxxxxxx.apigateway-cn-beijing.volceapi.com/run_code' \ + reward_model.sandbox_fusion.max_concurrent=128 \ + reward_model.reward_manager=prime \ + algorithm.adv_estimator=gae \ + data.train_files=$HOME/data/Eurus-2-RL-Data/train.parquet \ + data.val_files=$HOME/data/Eurus-2-RL-Data/validation.parquet \ + data.train_batch_size=1024 \ + data.max_prompt_length=512 \ + data.max_response_length=512 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ + critic.model.fsdp_config.param_offload=False \ + 
critic.model.fsdp_config.optimizer_offload=False \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_example_sandbox_fusion' \ + trainer.experiment_name='deepseek_llm_7b_function_sandbox_fusion' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=1 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_sp2.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_sp2.sh new file mode 100644 index 0000000000000000000000000000000000000000..3cb8a852b5ffd3eea40781b421157d699434408b --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_sp2.sh @@ -0,0 +1,43 @@ +set -x + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + data.train_batch_size=1024 \ + data.max_prompt_length=512 \ + data.max_response_length=512 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + critic.optim.lr=1e-5 \ + critic.ulysses_sequence_parallel_size=2 
\ + critic.model.use_remove_padding=True \ + critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=64 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_example_gsm8k' \ + trainer.experiment_name='deepseek_llm_7b_function_rm_sp2' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh new file mode 100644 index 0000000000000000000000000000000000000000..aa2b3e4a118dca4b7c2d1ae1695b987a5e81192d --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh @@ -0,0 +1,45 @@ +set -x + +train_files=$HOME/data/full_hh_rlhf/rl/train.parquet +test_files=$HOME/data/full_hh_rlhf/rl/train.parquet # no use + +python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=512 \ + data.max_prompt_length=128 \ + data.max_response_length=128 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=128 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ + actor_rollout_ref.rollout.name=vllm \ + 
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + critic.optim.lr=1e-5 \ + critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ + critic.ppo_micro_batch_size_per_gpu=4 \ + reward_model.enable=True \ + reward_model.model.path=deepseek-ai/deepseek-llm-7b-chat \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=4 \ + reward_model.rollout.prompt_length=256 \ + reward_model.rollout.response_length=128 \ + reward_model.num_workers=8 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_megatron_full_hh_rlhf_examples' \ + trainer.experiment_name='deepseek_llm_7b_model_rm' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=5 \ + trainer.total_epochs=100 $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron.sh new file mode 100644 index 0000000000000000000000000000000000000000..a128aabf30abb87553b31e217c09d8f4166acb43 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron.sh @@ -0,0 +1,49 @@ +set -x + +# Example runnable on H20 * 8 + +export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + 
data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=512 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \ + critic.optim.lr=1e-5 \ + critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ + critic.ppo_micro_batch_size_per_gpu=4 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_ppo_gsm8k_math_examples' \ + trainer.experiment_name='deepseek_llm_7b_megatron' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=5 \ + trainer.total_epochs=100 $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron_nsys.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron_nsys.sh new file mode 100644 index 0000000000000000000000000000000000000000..e467c3a5c3f97dad99e2345870239e99970f8a70 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron_nsys.sh @@ -0,0 +1,65 @@ +set -x + +# Example runnable on H20 * 8 + +export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping + 
+gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files=${train_files:-"$gsm8k_train_path"} +test_files=${test_files:-"$gsm8k_test_path"} + +# Nsight profiling configuration +PROFILE_STEPS="[1]" # or [] or null +PROFILE_RANKS_ALL=False # or True +PROFILE_RANKS=[0,4] +DISCRETE=True # or False + +python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=256 \ + data.max_prompt_length=1024 \ + data.max_response_length=512 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.profiler.enable=True \ + actor_rollout_ref.actor.profiler.ranks=$PROFILE_RANKS \ + actor_rollout_ref.actor.profiler.all_ranks=$PROFILE_RANKS_ALL \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \ + critic.optim.lr=1e-5 \ + critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ + critic.ppo_micro_batch_size_per_gpu=4 \ + critic.profiler.enable=True \ + critic.profiler.ranks=$PROFILE_RANKS \ + critic.profiler.all_ranks=$PROFILE_RANKS_ALL \ + algorithm.use_kl_in_reward=False
\ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_ppo_gsm8k_math_examples' \ + trainer.experiment_name='deepseek_llm_7b_megatron' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=-1 \ + trainer.total_epochs=100 \ + trainer.total_training_steps=1 \ + global_profiler.tool=nsys \ + global_profiler.steps=$PROFILE_STEPS \ + global_profiler.global_tool_config.nsys.discrete=$DISCRETE $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_gemma.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_gemma.sh new file mode 100644 index 0000000000000000000000000000000000000000..b015275c13496ae2514db6c756114d76897c7f71 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_gemma.sh @@ -0,0 +1,40 @@ +set -x + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + data.train_batch_size=512 \ + data.max_prompt_length=1024 \ + data.max_response_length=512 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=google/gemma-2-2b-it \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=False \ + actor_rollout_ref.actor.ppo_mini_batch_size=128 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=False \ + critic.model.path=google/gemma-2-2b-it \ + critic.model.enable_gradient_checkpointing=False \ + 
critic.ppo_micro_batch_size_per_gpu=4 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_example' \ + trainer.experiment_name='gemma2b_function_rm' \ + trainer.n_gpus_per_node=2 \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh new file mode 100644 index 0000000000000000000000000000000000000000..5070708b214a2d34549aa8a64445a53716573ea0 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh @@ -0,0 +1,106 @@ +set -x + +export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping + + +# 0. download the model +hf download moonshotai/Moonlight-16B-A3B-Instruct + +# 1. convert the model to mcore format +# change the HF_MODEL_PATH and DIST_CKPT_PATH to your own path +HF_MODEL_PATH=/data/models/moonshotai/Moonlight-16B-A3B-Instruct +DIST_CKPT_PATH=/data/mcore_ckpt/Moonlight-16B-A3B-Instruct +python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH + + +# 2. 
run the script +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +train_files=$gsm8k_train_path +test_files=$gsm8k_test_path + +ALL_OFFLOAD=${ALL_OFFLOAD:-False} +COMMON_PARAM_OFFLOAD=${COMMON_PARAM_OFFLOAD:-$ALL_OFFLOAD} +COMMON_GRAD_OFFLOAD=${COMMON_GRAD_OFFLOAD:-$ALL_OFFLOAD} +COMMON_OPTIMIZER_OFFLOAD=${COMMON_OPTIMIZER_OFFLOAD:-$ALL_OFFLOAD} + +ACTOR_PARAM_OFFLOAD=${ACTOR_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD} +ACTOR_GRAD_OFFLOAD=${ACTOR_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD} +ACTOR_OPTIMIZER_OFFLOAD=${ACTOR_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD} +REF_PARAM_OFFLOAD=${REF_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD} +CRITIC_PARAM_OFFLOAD=${CRITIC_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD} +CRITIC_GRAD_OFFLOAD=${CRITIC_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD} +CRITIC_OPTIMIZER_OFFLOAD=${CRITIC_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD} +RM_PARAM_OFFLOAD=${RM_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD} + + +NODES=4 +PP=2 +TP=8 +EP=8 +ETP=1 +VLLM_TP=4 + +# RAY_ADDRESS='auto' ray job submit --working-dir . 
-- +python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=512 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.trust_remote_code=True \ + actor_rollout_ref.model.path=$LLM \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ + critic.optim.lr=1e-5 \ + critic.model.path=$LLM \ + critic.ppo_micro_batch_size_per_gpu=4 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_megatron_gsm8k_examples' \ + trainer.experiment_name='moonlight_16b_a3b_instruct_1node' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=$NODES \ + trainer.save_freq=-1 \ + trainer.test_freq=5 \ + actor_rollout_ref.model.trust_remote_code=True \ + critic.model.trust_remote_code=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=13 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$VLLM_TP \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP \ + critic.megatron.pipeline_model_parallel_size=$PP \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP \ + critic.megatron.tensor_model_parallel_size=$TP \ + actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP \ + actor_rollout_ref.ref.megatron.expert_model_parallel_size=$EP \ + 
critic.megatron.expert_model_parallel_size=$EP \ + actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP \ + actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=$ETP \ + critic.megatron.expert_tensor_parallel_size=$ETP \ + actor_rollout_ref.actor.megatron.param_offload=${ACTOR_PARAM_OFFLOAD} \ + actor_rollout_ref.actor.megatron.optimizer_offload=${ACTOR_OPTIMIZER_OFFLOAD} \ + actor_rollout_ref.actor.megatron.grad_offload=${ACTOR_GRAD_OFFLOAD} \ + actor_rollout_ref.ref.megatron.param_offload=${REF_PARAM_OFFLOAD} \ + critic.megatron.param_offload=${CRITIC_PARAM_OFFLOAD} \ + critic.megatron.optimizer_offload=${CRITIC_OPTIMIZER_OFFLOAD} \ + critic.megatron.grad_offload=${CRITIC_GRAD_OFFLOAD} \ + actor_rollout_ref.actor.megatron.use_dist_checkpointing=True \ + actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \ + critic.megatron.use_dist_checkpointing=True \ + actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ + actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ + critic.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ + trainer.val_before_train=False \ + trainer.total_epochs=100 $@ + \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh new file mode 100644 index 0000000000000000000000000000000000000000..68854b703a48b7fdfa0a463be81e1c121ccbc53e --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh @@ -0,0 +1,73 @@ +set -x + +export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping + +# 0. download the model +#hf download Qwen/Qwen1.5-MoE-A2.7B-Chat + +# 1. 
convert the model to mcore format +# change the HF_MODEL_PATH and DIST_CKPT_PATH to your own path +HF_MODEL_PATH=/data/models/Qwen/Qwen1.5-MoE-A2.7B-Chat +DIST_CKPT_PATH=/data/mcore_ckpt/Qwen1.5-MoE-A2.7B-Chat +python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH + +# 2. run the script +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +train_files=$gsm8k_train_path +test_files=$gsm8k_test_path + +NODES=4 +PP=2 +TP=4 +CP=1 +VLLM_TP=4 + +# RAY_ADDRESS='auto' ray job submit --working-dir . -- +python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=512 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=$HF_MODEL_PATH \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \ + actor_rollout_ref.actor.megatron.context_parallel_size=$CP \ + actor_rollout_ref.actor.megatron.use_dist_checkpointing=True \ + actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP \ + actor_rollout_ref.ref.megatron.context_parallel_size=$CP \ + actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \ + actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \ + 
actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$VLLM_TP \ + critic.optim.lr=1e-5 \ + critic.model.path=$HF_MODEL_PATH \ + critic.ppo_micro_batch_size_per_gpu=4 \ + critic.megatron.tensor_model_parallel_size=$TP \ + critic.megatron.pipeline_model_parallel_size=$PP \ + critic.megatron.context_parallel_size=$CP \ + critic.megatron.use_dist_checkpointing=True \ + critic.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_megatron_gsm8k_examples' \ + trainer.experiment_name='qwen1.5_moe_nochat' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=$NODES \ + trainer.save_freq=20 \ + trainer.test_freq=5 \ + trainer.total_epochs=100 $@ + \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_math_gsm8k_megatron.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_math_gsm8k_megatron.sh new file mode 100644 index 0000000000000000000000000000000000000000..934d6e19b4edd9b4001a7a6afcff59d99646eccf --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_math_gsm8k_megatron.sh @@ -0,0 +1,47 @@ +set -x + +export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=512 \ + 
data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \ + critic.optim.lr=1e-5 \ + critic.model.path=Qwen/Qwen2-7B-Instruct \ + critic.ppo_micro_batch_size_per_gpu=4 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_ppo_gsm8k_math_examples' \ + trainer.experiment_name='qwen2_7b_megatron' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=5 \ + trainer.total_epochs=100 $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm.sh new file mode 100644 index 0000000000000000000000000000000000000000..baa9294400589bb0e3d6eb5678e8dd09bd459bd0 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm.sh @@ -0,0 +1,75 @@ +# Discliamer: the model used in the script is only for academic purpose. +set -x + +# Data preparation scripts are available in ``examples/data_preprocess``. 
+# Example usage: +# +# python3 examples/data_preprocess/math_dataset.py --local_dir ~/data/math +# python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + + +# prepare model ckpt +hf download Qwen/Qwen2-7B-Instruct --local-dir $HOME/models/Qwen2-7B-Instruct & +hf download sfairXC/FsfairX-LLaMA3-RM-v0.1 --local-dir $HOME/models/FsfairX-LLaMA3-RM-v0.1 & +wait + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=512 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path="$HOME/models/Qwen2-7B-Instruct" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.optim.lr_warmup_steps_ratio=0.05 \ + critic.model.path="$HOME/models/Qwen2-7B-Instruct" \ + 
critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + reward_model.enable=True \ + reward_model.model.path="$HOME/models/FsfairX-LLaMA3-RM-v0.1" \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.rollout.prompt_length=2048 \ + reward_model.rollout.response_length=1024 \ + reward_model.num_workers=8 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_example' \ + trainer.val_before_train=False \ + trainer.experiment_name='Qwen2-7B-Instruct_hybrid_rm' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_legacy.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_legacy.sh new file mode 100644 index 0000000000000000000000000000000000000000..51c5cbee6c36713f09b8a1441c954ca19aaf39fc --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_legacy.sh @@ -0,0 +1,63 @@ +# download datasets and models +# python3 examples/data_preprocess/gsm8k.py +# python3 examples/data_preprocess/math_dataset.py +# hf download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B +# hf download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m 
verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.optim.lr_warmup_steps_ratio=0.05 \ + critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + reward_model.enable=True \ + reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \ + reward_model.use_reward_loop=False \ + reward_model.model.use_remove_padding=True \ + reward_model.model.fsdp_config.param_offload=True \ + reward_model.micro_batch_size_per_gpu=32 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_test_qwen25_rm' \ + trainer.val_before_train=True \ + trainer.experiment_name='legacy_fsdp_reward_model' 
\ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_reward_loop_colocate.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_reward_loop_colocate.sh new file mode 100644 index 0000000000000000000000000000000000000000..9f9304c3b65ea7e54117758dc7a996687ec112c6 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_reward_loop_colocate.sh @@ -0,0 +1,69 @@ +# download datasets and models +# python3 examples/data_preprocess/gsm8k.py +# python3 examples/data_preprocess/math_dataset.py +# hf download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B +# hf download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + 
actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.optim.lr_warmup_steps_ratio=0.05 \ + critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + reward_model.enable=True \ + reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.prompt_length=4096 \ + reward_model.rollout.response_length=4096 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.num_workers=8 \ + reward_model.model.use_remove_padding=True \ + reward_model.model.fsdp_config.param_offload=True \ + reward_model.micro_batch_size_per_gpu=32 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_test_qwen25_rm' \ + trainer.val_before_train=False \ + trainer.experiment_name='reward_loop_colocate_reward_model' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh new file mode 100644 index 0000000000000000000000000000000000000000..902bcb8ede2461ae6a15a2dd2c43f39ba65a922a --- /dev/null +++ 
b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh @@ -0,0 +1,62 @@ +set -x + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=4096 \ + data.max_prompt_length=4096 \ + data.max_response_length=4096 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=512 \ + actor_rollout_ref.actor.use_dynamic_bsz=True \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=24000 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.model.path=Qwen/Qwen2-7B-Instruct \ + critic.model.enable_gradient_checkpointing=True \ + critic.use_dynamic_bsz=True \ + critic.ppo_max_token_len_per_gpu=98304 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + reward_model.enable=True \ + reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1\ + reward_model.use_reward_loop=True \ + 
reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.rollout.prompt_length=8192 \ + reward_model.rollout.response_length=4096 \ + reward_model.num_workers=8 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_example_gsm8k' \ + trainer.experiment_name='qwen2-7b_hybrid_rm_bsz8k_p4k_r4k_seq_packing' \ + trainer.n_gpus_per_node=8 \ + trainer.val_before_train=False \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh new file mode 100644 index 0000000000000000000000000000000000000000..fa2c154f3a1e053b928d33d1866cd83226e0f4ca --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh @@ -0,0 +1,66 @@ +set -x + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +FUSED_KERNEL_BACKEND=triton # or 'torch' for torch backend + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=4096 \ + data.max_prompt_length=4096 \ + data.max_response_length=4096 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + 
actor_rollout_ref.model.use_fused_kernels=True \ + actor_rollout_ref.model.fused_kernel_options.impl_backend=$FUSED_KERNEL_BACKEND \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=512 \ + actor_rollout_ref.actor.use_dynamic_bsz=True \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=24000 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.model.path=Qwen/Qwen2-7B-Instruct \ + critic.model.enable_gradient_checkpointing=True \ + critic.use_dynamic_bsz=True \ + critic.ppo_max_token_len_per_gpu=98304 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + reward_model.enable=True \ + reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1 \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.rollout.prompt_length=8192 \ + reward_model.rollout.response_length=4096 \ + reward_model.num_workers=8 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_example_gsm8k' \ + trainer.experiment_name='qwen2-7b_hybrid_rm_bsz8k_p4k_r4k_seq_packing_fused_kernel' \ + trainer.n_gpus_per_node=8 \ + trainer.val_before_train=False \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh 
b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh new file mode 100644 index 0000000000000000000000000000000000000000..5ccfe1b3cd5054d1c9a8bfd1e41fa36aa66962e3 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh @@ -0,0 +1,80 @@ +set -x + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files=${train_files:-"$gsm8k_train_path"} +test_files=${test_files:-"$gsm8k_test_path"} + +PROFILE_STEPS="[1,2,5]" # or [] or null +PROFILE_RANKS_ALL=False # or True +PROFILE_RANKS=[0,4] +DISCRETE=True # or True + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=4096 \ + data.max_prompt_length=4096 \ + data.max_response_length=4096 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=512 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.actor.use_dynamic_bsz=True \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=12000 \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.profiler.enable=True \ + actor_rollout_ref.actor.profiler.ranks=$PROFILE_RANKS \ + actor_rollout_ref.actor.profiler.all_ranks=$PROFILE_RANKS_ALL \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=vllm \ + 
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=24000 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.model.path=Qwen/Qwen2-7B-Instruct \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=2 \ + critic.use_dynamic_bsz=True \ + critic.ppo_max_token_len_per_gpu=98304 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + critic.profiler.enable=True \ + critic.profiler.ranks=$PROFILE_RANKS \ + critic.profiler.all_ranks=$PROFILE_RANKS_ALL \ + reward_model.enable=True \ + reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1\ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.rollout.prompt_length=8192 \ + reward_model.rollout.response_length=4096 \ + reward_model.num_workers=8 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_example_gsm8k' \ + trainer.experiment_name='qwen2-7b_hybrid_rm_bsz8k_p4k_r4k_seq_packing' \ + trainer.n_gpus_per_node=8 \ + trainer.val_before_train=False \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=-1 \ + trainer.total_epochs=15 \ + trainer.total_training_steps=6 \ + global_profiler.profile_continuous_steps=True \ + global_profiler.tool=nsys \ + global_profiler.steps=$PROFILE_STEPS \ + global_profiler.global_tool_config.nsys.discrete=$DISCRETE $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_seq_balance.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_seq_balance.sh new file mode 100644 index 0000000000000000000000000000000000000000..f055ea5d4fd155de08ce902216b723fd78d1219d --- /dev/null +++ 
b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_seq_balance.sh @@ -0,0 +1,58 @@ +set -x + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +# For async rollout mode, dataset should return raw chat. +rollout_mode="async" +return_raw_chat="True" + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.return_raw_chat=$return_raw_chat \ + data.train_batch_size=4096 \ + data.max_prompt_length=4096 \ + data.max_response_length=4096 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=512 \ + actor_rollout_ref.actor.use_dynamic_bsz=True \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.mode=$rollout_mode \ + actor_rollout_ref.rollout.multi_turn.format=hermes \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=24000 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.model.path=Qwen/Qwen2-7B-Instruct \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_max_token_len_per_gpu=98304 \ + critic.model.fsdp_config.param_offload=False \ + 
critic.model.fsdp_config.optimizer_offload=False \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_example_gsm8k' \ + trainer.experiment_name='qwen2-7b_function_rm_bsz8k_p4k_r4k_seq_packing' \ + trainer.n_gpus_per_node=8 \ + trainer.val_before_train=False \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_sglang_seq_balance.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_sglang_seq_balance.sh new file mode 100644 index 0000000000000000000000000000000000000000..5108e8b5dd92f53d6c822528d3be50983c6044ff --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_sglang_seq_balance.sh @@ -0,0 +1,51 @@ +set -x + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=4096 \ + data.max_prompt_length=4096 \ + data.max_response_length=4096 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=512 \ + actor_rollout_ref.actor.use_dynamic_bsz=True \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + 
actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=sglang \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=24000 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.model.path=Qwen/Qwen2-7B-Instruct \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_max_token_len_per_gpu=98304 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_example_gsm8k' \ + trainer.experiment_name='qwen2-7b_function_rm_bsz8k_p4k_r4k_seq_packing' \ + trainer.n_gpus_per_node=8 \ + trainer.val_before_train=False \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2.5-32b.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2.5-32b.sh new file mode 100644 index 0000000000000000000000000000000000000000..58037658500a443a35424158af3d40fc9b87512c --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2.5-32b.sh @@ -0,0 +1,50 @@ +set -x + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + 
actor_rollout_ref.model.path=Qwen/Qwen2.5-32B-Instruct \ + actor_rollout_ref.model.enable_gradient_checkpointing=False \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.model.path=Qwen/Qwen2.5-32B-Instruct \ + critic.model.enable_gradient_checkpointing=False \ + critic.ppo_micro_batch_size_per_gpu=8 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_example' \ + trainer.experiment_name='Qwen2.5-32B-Instruct_function_rm' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=4 \ + trainer.save_freq=20 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2.5-3b_rm_legacy.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2.5-3b_rm_legacy.sh new file mode 100644 index 0000000000000000000000000000000000000000..51c5cbee6c36713f09b8a1441c954ca19aaf39fc --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2.5-3b_rm_legacy.sh @@ -0,0 +1,63 @@ +# download datasets and models +# python3 examples/data_preprocess/gsm8k.py +# python3 examples/data_preprocess/math_dataset.py +# hf download 
Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B +# hf download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.optim.lr_warmup_steps_ratio=0.05 \ + critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + reward_model.enable=True \ + 
reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \ + reward_model.use_reward_loop=False \ + reward_model.model.use_remove_padding=True \ + reward_model.model.fsdp_config.param_offload=True \ + reward_model.micro_batch_size_per_gpu=32 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_test_qwen25_rm' \ + trainer.val_before_train=True \ + trainer.experiment_name='legacy_fsdp_reward_model' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2.5-3b_rm_reward_loop_colocate.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2.5-3b_rm_reward_loop_colocate.sh new file mode 100644 index 0000000000000000000000000000000000000000..24fc88faa81f1be21b39c3841a1bcacdfe1c5746 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2.5-3b_rm_reward_loop_colocate.sh @@ -0,0 +1,66 @@ +# download datasets and models +# python3 examples/data_preprocess/gsm8k.py +# python3 examples/data_preprocess/math_dataset.py +# hf download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B +# hf download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ 
+ data.return_raw_chat=True \ + actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.optim.lr_warmup_steps_ratio=0.05 \ + critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + reward_model.enable=True \ + reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.rollout.prompt_length=4096 \ + reward_model.rollout.response_length=4096 \ + reward_model.num_workers=8 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_test_qwen25_rm' \ + trainer.val_before_train=False \ + trainer.experiment_name='reward_loop_colocate_reward_model' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 $@ diff --git 
a/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen3-8b_npu.sh b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen3-8b_npu.sh new file mode 100644 index 0000000000000000000000000000000000000000..a0ada0eb3886dcc69a2f6a963dec73e17df611c3 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen3-8b_npu.sh @@ -0,0 +1,54 @@ +set -x + +export VLLM_USE_V1=1 + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files=$HOME/data/dapo-math-17k.parquet \ + data.val_files=$HOME/data/dapo-math-17k.parquet \ + data.train_batch_size=256 \ + data.max_prompt_length=2000 \ + data.max_response_length=12000 \ + data.shuffle=False \ + actor_rollout_ref.model.path=Qwen/Qwen3-8B \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.actor.fsdp_config.param_offload=True \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \ + actor_rollout_ref.actor.use_dynamic_bsz=True \ + actor_rollout_ref.actor.use_torch_compile=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \ + actor_rollout_ref.rollout.max_num_batched_tokens=14000 \ + actor_rollout_ref.rollout.max_num_seqs=64 \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.enforce_eager=False \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.model.path=Qwen/Qwen3-8B \ + critic.model.enable_gradient_checkpointing=True \ + 
critic.ppo_micro_batch_size_per_gpu=1 \ + critic.ulysses_sequence_parallel_size=2 \ + critic.model.fsdp_config.param_offload=True \ + critic.model.fsdp_config.optimizer_offload=True \ + critic.use_dynamic_bsz=True \ + trainer.critic_warmup=0 \ + trainer.logger=console \ + trainer.project_name='verl_example_dapo_math_17k' \ + trainer.experiment_name='qwen3_8b_fsdp' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=20 \ + trainer.test_freq=-1 \ + trainer.val_before_train=False \ + trainer.max_actor_ckpt_to_keep=1 \ + trainer.max_critic_ckpt_to_keep=1 \ + trainer.total_training_steps=100 $@ \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/examples/ray/tutorial.ipynb b/code/RL_model/verl/verl_train/examples/ray/tutorial.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..ca176af0f7940f705281de7ce707d1fa27238c02 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/ray/tutorial.ipynb @@ -0,0 +1,963 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0ddc582b", + "metadata": {}, + "source": [ + "# VeRL Ray API Tutorial" + ] + }, + { + "cell_type": "markdown", + "id": "71fe3b94", + "metadata": {}, + "source": [ + "## Chapter 1: Ray Basics" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "id": "1347d381", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "id": "e75b9d44", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "import ray\n", + "import torch\n", + "\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "id": "2e90ae00", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-11-01 17:27:19,132\tINFO worker.py:1752 -- Started a local Ray instance.\n" + ] + }, + { + "data": { + 
"application/vnd.jupyter.widget-view+json": { + "model_id": "9cc9d2ccbdfb48918c8fd6cd13a0807a", + "version_major": 2, + "version_minor": 0 + }, + "text/html": [ + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Python version:3.9.2
Ray version:2.10.0
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + "RayContext(dashboard_url='', python_version='3.9.2', ray_version='2.10.0', ray_commit='09abba26b5bf2707639bb637c208d062a47b46f6')" + ] + }, + "execution_count": 146, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m(GPUAccumulator pid=224400)\u001b[0m rank 0, value: tensor([1.], device='cuda:0')\n", + "\u001b[36m(GPUAccumulator pid=225234)\u001b[0m rank 2, value: tensor([3.], device='cuda:0')\n", + "\u001b[36m(GPUAccumulator pid=225607)\u001b[0m rank 0, value: tensor([2.], device='cuda:0')\n", + "\u001b[36m(GPUAccumulator pid=226423)\u001b[0m rank 1, value: tensor([3.], device='cuda:0')\n", + "\u001b[36m(GPUAccumulator pid=226857)\u001b[0m rank 3, value: tensor([6.], device='cuda:0')\n", + "\u001b[36m(GPUAccumulatorDecorator pid=227475)\u001b[0m 10\n", + "\u001b[36m(GPUAccumulatorDecorator pid=227475)\u001b[0m rank 0, value: tensor([10.], device='cuda:0')\n", + "\u001b[36m(GPUAccumulatorDecorator pid=227655)\u001b[0m rank 1, value: tensor([11.], device='cuda:0')\n" + ] + } + ], + "source": [ + "# Build a local ray cluster. The head node and worker node are on this machine\n", + "ray.init()" + ] + }, + { + "cell_type": "markdown", + "id": "a127e4e4", + "metadata": {}, + "source": [ + "Implement an Accumulator class." + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "id": "20e7b9a3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "@ray.remote\n", + "class Accumulator:\n", + " def __init__(self):\n", + " self.value = 0\n", + "\n", + " def add(self, x):\n", + " self.value += x\n", + "\n", + " def get_value(self):\n", + " return self.value" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "id": "3b80098c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Instantiate an accumulator. 
Accumulator can be viewed as a process, acting as an RPC service.\n", + "accumulator = Accumulator.remote()" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "id": "b14b1009", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + } + ], + "source": [ + "value_ref = accumulator.get_value.remote() # Check the current value. Note that this function returns immediately and does not actually wait for the remote execution to complete.\n", + "# Get the value\n", + "value = ray.get(value_ref)\n", + "print(value)" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "id": "513a84b3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10\n" + ] + } + ], + "source": [ + "# Accumulate, then check the result.\n", + "accumulator.add.remote(10) # Similarly, the 'add' here will return immediately.\n", + "new_value = ray.get(accumulator.get_value.remote())\n", + "print(new_value)" + ] + }, + { + "cell_type": "markdown", + "id": "3c332fe0", + "metadata": {}, + "source": [ + "## Chapter 2: Resource Pool and RayWorkerGroup\n", + "In the previous example, it was a simple single-process worker. \n", + "In this example, we implement a worker with a GPU and form a RayWorkerGroup. Within this RayWorkerGroup, we implement a simple operation of an accumulator." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 151, + "id": "04229afb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from verl.single_controller.base import Worker\n", + "from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup, merge_resource_pool" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "id": "0d0dbd58", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "resource_pool = RayResourcePool([4], use_gpu=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "id": "68f6838a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "@ray.remote\n", + "class GPUAccumulator(Worker):\n", + " def __init__(self) -> None:\n", + " super().__init__()\n", + " # The initial value of each rank is the same as the rank\n", + " self.value = torch.zeros(size=(1,), device=\"cuda\") + self.rank\n", + "\n", + " def add(self, x):\n", + " self.value += x\n", + " print(f\"rank {self.rank}, value: {self.value}\")\n", + " return self.value.cpu()" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "id": "23aad8fe", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[tensor([1.]), tensor([2.]), tensor([3.]), tensor([4.])]\n" + ] + } + ], + "source": [ + "# Each worker's initial value is its rank, and then each rank's value is incremented by 1, so the values obtained on each rank are [1, 2, 3, 4]\n", + "class_with_args = RayClassWithInitArgs(cls=GPUAccumulator)\n", + "worker_group = RayWorkerGroup(resource_pool, class_with_args)\n", + "print(worker_group.execute_all_sync(\"add\", x=[1, 1, 1, 1]))" + ] + }, + { + "cell_type": "markdown", + "id": "e6705284", + "metadata": {}, + "source": [ + "The principle of parameter passing: The input parameter is a list of length world_size, where each element in the list is dispatched respectively to each worker in the RayWorkerGroup. 
\n", + "The return parameter is also a list, corresponding to the return value of each worker." + ] + }, + { + "cell_type": "markdown", + "id": "d25c2412", + "metadata": {}, + "source": [ + "### GPU Resource Sharing" + ] + }, + { + "cell_type": "markdown", + "id": "f74f6d24", + "metadata": {}, + "source": [ + "RayWorkerGroups mapped to the same resource pool share the GPU. In this example, we implement three resource pools: the first occupies 4 GPUs, the second also occupies 4 GPUs, and the last occupies all 8 GPUs. Among them, the first resource pool reuses the resource pool mentioned above." + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "id": "49f9c06f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Create a new resource pool and then merge the newly created resource pool with the previous one.\n", + "resource_pool_1 = RayResourcePool([4], use_gpu=True, name_prefix=\"a\")\n", + "resource_pool_merge = merge_resource_pool(resource_pool, resource_pool_1)" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "id": "05c2e305", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Establish a RayWorkerGroup on the newly created resource pool.\n", + "worker_group_1 = RayWorkerGroup(resource_pool_1, class_with_args)\n", + "worker_group_merge = RayWorkerGroup(resource_pool_merge, class_with_args)" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "id": "6b9b13f4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[tensor([2.]), tensor([3.]), tensor([4.]), tensor([5.])]\n" + ] + } + ], + "source": [ + "# Run 'add' on the second set of 4 GPUs; the result should be [2, 3, 4, 5].\n", + "output_1 = worker_group_1.execute_all_sync(\"add\", x=[2, 2, 2, 2])\n", + "print(output_1)" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "id": "d856d030", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "[tensor([3.]), tensor([4.]), tensor([5.]), tensor([6.]), tensor([7.]), tensor([8.]), tensor([9.]), tensor([10.])]\n" + ] + } + ], + "source": [ + "# Run 'add' on the merged set of 8 GPUs; the result should be [3, 4, 5, 6, 7, 8, 9, 10].\n", + "output_merge = worker_group_merge.execute_all_sync(\"add\", x=[3, 3, 3, 3, 3, 3, 3, 3])\n", + "print(output_merge)" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "id": "33a4628c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4 4 8\n" + ] + } + ], + "source": [ + "print(worker_group.world_size, worker_group_1.world_size, worker_group_merge.world_size)" + ] + }, + { + "cell_type": "markdown", + "id": "3df19d13", + "metadata": {}, + "source": [ + "## Chapter 3: Data Dispatch, Execution and Collection" + ] + }, + { + "cell_type": "markdown", + "id": "acb22d9d", + "metadata": {}, + "source": [ + "In the above example, we used the `execute_all_sync` function in the RayWorkerGroup to dispatch data from the driver to each worker. This is very inconvenient for coding. \n", + "In this chapter, we use the form of function decorators to allow RayWorkerGroup to directly call functions written in the Worker, and to greatly simplify parameter passing." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 160, + "id": "35237432", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from verl.single_controller.base.decorator import Dispatch, Execute, register" + ] + }, + { + "cell_type": "code", + "execution_count": 161, + "id": "88b8ba3b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "@ray.remote\n", + "class GPUAccumulatorDecorator(Worker):\n", + " def __init__(self) -> None:\n", + " super().__init__()\n", + " # The initial value of each rank is the same as the rank\n", + " self.value = torch.zeros(size=(1,), device=\"cuda\") + self.rank\n", + "\n", + " # map from a single input to all the worker\n", + " @register(Dispatch.ONE_TO_ALL)\n", + " def add(self, x):\n", + " print(x)\n", + " self.value = self.value + x\n", + " print(f\"rank {self.rank}, value: {self.value}\")\n", + " return self.value.cpu()" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "id": "eddaa043", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class_with_args = RayClassWithInitArgs(cls=GPUAccumulatorDecorator)\n", + "gpu_accumulator_decorator = RayWorkerGroup(resource_pool_merge, class_with_args)" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "id": "10087c91", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[tensor([10.]), tensor([11.]), tensor([12.]), tensor([13.]), tensor([14.]), tensor([15.]), tensor([16.]), tensor([17.])]\n" + ] + } + ], + "source": [ + "# As we can see, 10 is automatically dispatched to each Worker in this RayWorkerGroup.\n", + "print(gpu_accumulator_decorator.add(x=10))" + ] + }, + { + "cell_type": "markdown", + "id": "540ee6ad", + "metadata": {}, + "source": [ + "### Custom Dispatch, Collection\n", + "Users can customize `dispatch` and `collection` function. You only need to write the `dispatch_fn` and `collect_fn` functions yourself. 
We also support executing RPC only on rank_zero, with specific examples provided below." + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "id": "8e041270", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from verl.single_controller.base.decorator import Dispatch, collect_all_to_all, register" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "id": "43b5be31", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def two_to_all_dispatch_fn(worker_group, *args, **kwargs):\n", + " \"\"\"\n", + " Assume the input is a list of 2. Duplicate the input interleaved and pass to each worker.\n", + " \"\"\"\n", + " for arg in args:\n", + " assert len(arg) == 2\n", + " for i in range(worker_group.world_size - 2):\n", + " arg.append(arg[i % 2])\n", + " for k, v in kwargs.items():\n", + " assert len(v) == 2\n", + " for i in range(worker_group.world_size - 2):\n", + " v.append(v[i % 2])\n", + " return args, kwargs\n", + "\n", + "\n", + "@ray.remote\n", + "class TestActor(Worker):\n", + " # TODO: pass *args and **kwargs is bug prone and not very convincing\n", + " def __init__(self, x) -> None:\n", + " super().__init__()\n", + " self._x = x\n", + "\n", + " def foo(self, y):\n", + " return self._x + y\n", + "\n", + " @register(dispatch_mode=Dispatch.ALL_TO_ALL, execute_mode=Execute.RANK_ZERO)\n", + " def foo_rank_zero(self, x, y):\n", + " return self._x + y + x\n", + "\n", + " @register(dispatch_mode={\"dispatch_fn\": two_to_all_dispatch_fn, \"collect_fn\": collect_all_to_all})\n", + " def foo_custom(self, x, y):\n", + " return self._x + y + x" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "id": "83ec6609", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class_with_args = RayClassWithInitArgs(cls=TestActor, x=2)\n", + "worker_group = RayWorkerGroup(resource_pool, class_with_args)" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "id": "62c58d8a", + "metadata": { + 
"tags": [] + }, + "outputs": [], + "source": [ + "output_ref = worker_group.foo_custom(x=[1, 2], y=[5, 6])\n", + "assert output_ref == [8, 10, 8, 10]\n", + "\n", + "output_ref = worker_group.foo_rank_zero(x=1, y=2)\n", + "assert output_ref == 5" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "id": "14689353", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8\n" + ] + } + ], + "source": [ + "print(gpu_accumulator_decorator.world_size)" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "id": "2c80bbf4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Shutdown ray cluster\n", + "ray.shutdown()" + ] + }, + { + "cell_type": "markdown", + "id": "a5c8151c", + "metadata": {}, + "source": [ + "## Chapter 4: NVMegatronRayWorkerGroup" + ] + }, + { + "cell_type": "markdown", + "id": "cd5680e9", + "metadata": {}, + "source": [ + "Due to the Ray issue, we can only support max_colocate_count=1 in RayResourcePool for now. \n", + "This means that each GPU can only have one process.\n", + "We can support max_colocate > 1 when applying this pull request: https://github.com/ray-project/ray/pull/44385" + ] + }, + { + "cell_type": "markdown", + "id": "92724419", + "metadata": {}, + "source": [ + "Therefore, we need to restart the ray and initialize a new resource_pool to demonstrate the **NVMegatronRayWorkerGroup**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b038538", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Build a local ray cluster. The head node and worker node are on this machine\n", + "ray.init()" + ] + }, + { + "cell_type": "markdown", + "id": "ebfd8798", + "metadata": {}, + "source": [ + "Finally, we implement a `NVMegatronRayWorkerGroup`, within which we create a Megatron and then run a tensor parallel (tp) split Llama mlp layer. Here, we use a complex dispatch mode, `Megatron_COMPUTE`. 
This dispatch mode assumes that user passes the data partitioned by DP dimension. The data is dispatched to all tp/pp ranks within the same dp group, and ultimately only collects output data from tp=0 and the last pp. In this way, for users that only write code on the driver, the Megatron behind the RPC becomes transparent." + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "id": "5a032154", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/opt/tiger/Megatron-LM\n", + "/opt/tiger/Megatron-LM/megatron/__init__.py\n" + ] + } + ], + "source": [ + "import sys\n", + "\n", + "current_pythonpath = os.environ.get(\"PYTHONPATH\", \"\")\n", + "\n", + "new_path = \"/opt/tiger/Megatron-LM\"\n", + "\n", + "new_pythonpath = f\"{new_path}:{current_pythonpath}\" if current_pythonpath else new_path\n", + "\n", + "os.environ[\"PYTHONPATH\"] = new_pythonpath\n", + "\n", + "print(new_path)\n", + "sys.path.append(new_path)\n", + "\n", + "import megatron\n", + "\n", + "print(megatron.__file__)" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "id": "8c84cd5a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from megatron.core import parallel_state as mpu\n", + "from omegaconf import OmegaConf\n", + "\n", + "from verl.single_controller.base.decorator import Dispatch, Execute, register\n", + "from verl.single_controller.base.megatron.worker import MegatronWorker\n", + "from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup\n", + "from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "id": "1b1debcc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "resource_pool = RayResourcePool([4], use_gpu=True, max_colocate_count=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 174, + "id": "bccbe081", + "metadata": { + "tags": [] + 
}, + "outputs": [], + "source": [ + "@ray.remote\n", + "class MLPLayerWorker(MegatronWorker):\n", + " def __init__(self):\n", + " super().__init__()\n", + " rank = int(os.environ[\"LOCAL_RANK\"])\n", + " torch.distributed.init_process_group(backend=\"nccl\")\n", + " torch.cuda.set_device(rank)\n", + "\n", + " mpu.initialize_model_parallel(\n", + " tensor_model_parallel_size=4,\n", + " pipeline_model_parallel_size=1,\n", + " virtual_pipeline_model_parallel_size=None,\n", + " pipeline_model_parallel_split_rank=None,\n", + " use_sharp=False,\n", + " context_parallel_size=1,\n", + " expert_model_parallel_size=1,\n", + " nccl_communicator_config_path=None,\n", + " )\n", + " from megatron.core import tensor_parallel\n", + "\n", + " tensor_parallel.model_parallel_cuda_manual_seed(10)\n", + "\n", + " @register(Dispatch.ONE_TO_ALL)\n", + " def init_model(self, config):\n", + " from omegaconf import OmegaConf\n", + "\n", + " from verl.models.llama.megatron.layers import ParallelLlamaMLP\n", + " from verl.utils.megatron_utils import init_model_parallel_config\n", + "\n", + " megatron_config = OmegaConf.create(\n", + " {\n", + " \"sequence_parallel\": False,\n", + " \"param_dtype\": \"fp32\",\n", + " \"tensor_model_parallel_size\": mpu.get_tensor_model_parallel_world_size(),\n", + " \"pipeline_model_parallel_rank\": mpu.get_pipeline_model_parallel_rank(),\n", + " \"pipeline_model_parallel_size\": mpu.get_pipeline_model_parallel_world_size(),\n", + " \"virtual_pipeline_model_parallel_rank\": mpu.get_virtual_pipeline_model_parallel_rank(),\n", + " \"virtual_pipeline_model_parallel_size\": mpu.get_virtual_pipeline_model_parallel_world_size(),\n", + " }\n", + " )\n", + "\n", + " megatron_config = init_model_parallel_config(megatron_config)\n", + " self.parallel_layer = ParallelLlamaMLP(config=config, megatron_config=megatron_config)\n", + "\n", + " @register(Dispatch.ONE_TO_ALL)\n", + " def get_weights(self):\n", + " output = {}\n", + " for key, val in 
self.parallel_layer.named_parameters():\n", + " output[key] = val\n", + " return output\n", + "\n", + " @register(Dispatch.MEGATRON_COMPUTE)\n", + " def run_layer(self, x):\n", + " x = x.to(\"cuda\")\n", + " y = self.parallel_layer(x)\n", + " return y" + ] + }, + { + "cell_type": "code", + "execution_count": 175, + "id": "a655271d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "layer_cls = RayClassWithInitArgs(cls=MLPLayerWorker)\n", + "layer_worker_group = NVMegatronRayWorkerGroup(\n", + " resource_pool=resource_pool,\n", + " ray_cls_with_init=layer_cls,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "id": "f105ebee", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4 4 1 1\n" + ] + } + ], + "source": [ + "print(layer_worker_group.world_size, layer_worker_group.tp_size, layer_worker_group.pp_size, layer_worker_group.dp_size)" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "id": "38655091", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ffn_hidden_size = 11008\n", + "batch_size = 16\n", + "seq_len = 2048\n", + "hidden_size = 4096\n", + "\n", + "config = OmegaConf.create(\n", + " {\n", + " \"hidden_size\": hidden_size,\n", + " \"intermediate_size\": ffn_hidden_size,\n", + " \"hidden_act\": \"silu\",\n", + " \"pretraining_tp\": 1,\n", + " \"tp\": layer_worker_group.tp_size,\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 178, + "id": "a026efca", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "x = torch.rand(size=(seq_len, batch_size, hidden_size), dtype=torch.float32)" + ] + }, + { + "cell_type": "code", + "execution_count": 179, + "id": "f5fcaf13", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[None, None, None, None]" + ] + }, + "execution_count": 179, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"layer_worker_group.init_model(config)" + ] + }, + { + "cell_type": "code", + "execution_count": 180, + "id": "3f5cc9b4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([2048, 16, 4096])\n" + ] + } + ], + "source": [ + "output = layer_worker_group.run_layer(\n", + " [x]\n", + ") # This must be a list of size 1, ensuring that the input equals the data parallel (dp).\n", + "print(output[0].shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "id": "49792210", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Shutdown ray cluster\n", + "ray.shutdown()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/code/RL_model/verl/verl_train/examples/reinforce_plus_plus_trainer/run_qwen2-7b_math_rf.sh b/code/RL_model/verl/verl_train/examples/reinforce_plus_plus_trainer/run_qwen2-7b_math_rf.sh new file mode 100644 index 0000000000000000000000000000000000000000..3e1de4af113eeb25013f396a8fd78cca56081231 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/reinforce_plus_plus_trainer/run_qwen2-7b_math_rf.sh @@ -0,0 +1,49 @@ +set -x + + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=reinforce_plus_plus \ + data.train_files="$train_files" \ + 
data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \ + actor_rollout_ref.actor.optim.lr=3e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=1024 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=mse \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.rollout.n=8 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.use_kl_in_reward=True \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_grpo_example_gsm8k' \ + trainer.experiment_name='qwen2_7b_function_rm' \ + trainer.n_gpus_per_node=16 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/reinforce_plus_plus_trainer/run_qwen2-7b_math_rf_baseline.sh b/code/RL_model/verl/verl_train/examples/reinforce_plus_plus_trainer/run_qwen2-7b_math_rf_baseline.sh new file mode 100644 index 0000000000000000000000000000000000000000..fb827168a19aa2e929fc3af7b2e3c87b22c52295 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/reinforce_plus_plus_trainer/run_qwen2-7b_math_rf_baseline.sh @@ -0,0 +1,49 
@@ +set -x + + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=reinforce_plus_plus_baseline \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \ + actor_rollout_ref.actor.optim.lr=3e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=1024 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=mse \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.rollout.n=8 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.use_kl_in_reward=True \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_grpo_example_gsm8k' \ + trainer.experiment_name='qwen2_7b_function_rm' \ + trainer.n_gpus_per_node=16 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ diff --git 
a/code/RL_model/verl/verl_train/examples/remax_trainer/run_qwen2.5-3b_seq_balance.sh b/code/RL_model/verl/verl_train/examples/remax_trainer/run_qwen2.5-3b_seq_balance.sh new file mode 100644 index 0000000000000000000000000000000000000000..feebe8a847594671fe7c8a9d2468c52eaaf33cac --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/remax_trainer/run_qwen2.5-3b_seq_balance.sh @@ -0,0 +1,43 @@ +set -x + +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=remax \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + data.train_batch_size=512 \ + data.max_prompt_length=512 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=128 \ + actor_rollout_ref.actor.use_dynamic_bsz=True \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=30000 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \ + actor_rollout_ref.rollout.n=4 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.use_kl_in_reward=True \ + algorithm.kl_penalty=kl \ + algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_remax_example_gsm8k' \ + trainer.experiment_name='qwen2.5_3b_function_rm_kl1e-3' \ + trainer.val_before_train=False \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + 
trainer.test_freq=5 \ + trainer.total_epochs=5 $@ diff --git a/code/RL_model/verl/verl_train/examples/remax_trainer/run_qwen2.5-7b_seq_balance.sh b/code/RL_model/verl/verl_train/examples/remax_trainer/run_qwen2.5-7b_seq_balance.sh new file mode 100644 index 0000000000000000000000000000000000000000..8734eb351319f88417c767aad670052ee4b113a4 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/remax_trainer/run_qwen2.5-7b_seq_balance.sh @@ -0,0 +1,43 @@ +set -x + +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=remax \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + data.train_batch_size=1024 \ + data.max_prompt_length=512 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-7B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.use_dynamic_bsz=True \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \ + actor_rollout_ref.rollout.n=4 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.use_kl_in_reward=True \ + algorithm.kl_penalty=kl \ + algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_remax_example_gsm8k' \ + trainer.experiment_name='qwen2.5_7b_function_rm_kl1e-3' \ + trainer.val_before_train=False \ + 
trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=5 \ + trainer.total_epochs=10 $@ diff --git a/code/RL_model/verl/verl_train/examples/rloo_trainer/run_qwen2-7b.sh b/code/RL_model/verl/verl_train/examples/rloo_trainer/run_qwen2-7b.sh new file mode 100644 index 0000000000000000000000000000000000000000..fc9b6e29fdebd0245f7ecf6cf42d9b369e8fa1db --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/rloo_trainer/run_qwen2-7b.sh @@ -0,0 +1,40 @@ +set -x + + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=rloo \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + data.train_batch_size=1024 \ + data.max_prompt_length=512 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=80 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=160 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=160 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.use_kl_in_reward=True \ + algorithm.kl_penalty=kl \ + algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_rloo_example_gsm8k' \ + trainer.experiment_name='qwen2_7b_function_rm' \ + 
trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/rollout_correction/run_with_rollout_corr.sh b/code/RL_model/verl/verl_train/examples/rollout_correction/run_with_rollout_corr.sh new file mode 100644 index 0000000000000000000000000000000000000000..7e763b02a95e0b2f26f63d910908bab16f0c3c43 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/rollout_correction/run_with_rollout_corr.sh @@ -0,0 +1,100 @@ +#!/usr/bin/env bash +# Example: RLOO (REINFORCE Leave-One-Out) with Rollout Correction +# This demonstrates self-normalized sequence-level IS with pure policy gradient +# +# References: +# - Rollout Correction Docs: https://github.com/volcengine/verl/blob/main/docs/algo/rollout_corr.md +# - Rollout Correction Math: https://github.com/volcengine/verl/blob/main/docs/algo/rollout_corr_math.md + +set -xeuo pipefail + +# ============================================================================== +# Rollout Correction Configuration (RLOO) +# ============================================================================== + +# Importance Sampling (IS) weights configuration +rollout_is="sequence" # Self-normalized sequence-level IS +rollout_is_threshold=2.0 # Upper threshold for IS weights +rollout_is_batch_normalize="true" # Self-normalization (mean=1.0) + +# Rejection Sampling (RS) configuration +rollout_rs="null" # No rejection sampling for basic RLOO +rollout_rs_threshold="null" # RS threshold spec (string or float) + +# Bypass mode with REINFORCE loss (no PPO clipping) +bypass_mode="true" # Skip old_log_prob computation +loss_type="reinforce" # REINFORCE with explicit IS weights (alternative: "ppo_clip") + +# ============================================================================== +# Model and Data Configuration +# ============================================================================== + 
+MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2.5-7B"} +TRAIN_FILE=${TRAIN_FILE:-"data/train.parquet"} +TEST_FILE=${TEST_FILE:-"data/test.parquet"} + +max_prompt_length=2048 +max_response_length=4096 + +# ============================================================================== +# Training Configuration +# ============================================================================== + +train_batch_size=128 +ppo_mini_batch_size=32 +ppo_epochs=1 +learning_rate=5e-7 + +# ============================================================================== +# Algorithm Configuration (RLOO) +# ============================================================================== + +adv_estimator=rloo # RLOO advantage estimator +gamma=1.0 + +# ============================================================================== +# Launch Training +# ============================================================================== + +python3 -m verl.trainer.main_ppo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_batch_size} \ + data.truncation='left' \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.gamma=${gamma} \ + algorithm.rollout_correction.rollout_is=${rollout_is} \ + algorithm.rollout_correction.rollout_is_threshold=${rollout_is_threshold} \ + algorithm.rollout_correction.rollout_is_batch_normalize=${rollout_is_batch_normalize} \ + algorithm.rollout_correction.rollout_rs=${rollout_rs} \ + algorithm.rollout_correction.rollout_rs_threshold=${rollout_rs_threshold} \ + algorithm.rollout_correction.bypass_mode=${bypass_mode} \ + algorithm.rollout_correction.loss_type=${loss_type} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=${learning_rate} \ + actor_rollout_ref.actor.ppo_mini_batch_size=${ppo_mini_batch_size} \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \ + 
actor_rollout_ref.actor.ppo_epochs=${ppo_epochs} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.rollout.name=vllm \ + trainer.logger='["console","wandb"]' \ + trainer.project_name="rollout_corr_rloo_example" \ + trainer.experiment_name="rloo_seq_is_pure" \ + trainer.total_epochs=10 + +echo "Training completed!" +echo "" +echo "RLOO Configuration:" +echo " - Algorithm: RLOO (REINFORCE Leave-One-Out)" +echo " - Advantage estimator: ${adv_estimator}" +echo " - IS mode: ${rollout_is} (self-normalized: ${rollout_is_batch_normalize})" +echo " - IS threshold: ${rollout_is_threshold}" +echo " - Bypass mode: ${bypass_mode}, loss_type: ${loss_type}" +echo "" +echo "Monitor these key metrics in wandb:" +echo " - rollout_corr/rollout_is_mean (should be ~1.0 before batch norm)" +echo " - rollout_corr/rollout_is_batch_norm_factor (normalization factor applied)" +echo " - rollout_corr/rollout_is_eff_sample_size (should be >0.5)" diff --git a/code/RL_model/verl/verl_train/examples/rollout_correction/run_with_rollout_corr_multi_rs.sh b/code/RL_model/verl/verl_train/examples/rollout_correction/run_with_rollout_corr_multi_rs.sh new file mode 100644 index 0000000000000000000000000000000000000000..d2168413e57b971f0d9cf0d6286f316e0ea6648d --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/rollout_correction/run_with_rollout_corr_multi_rs.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash +# Example: PPO-clip with Rollout Correction using multiple RS criteria +# Demonstrates chaining token-level and sequence-level rejection sampling +# (token_k1 + seq_max_k2) alongside optional IS metrics. 
+# +# References: +# - Rollout Correction Docs: https://github.com/volcengine/verl/blob/main/docs/algo/rollout_corr.md +# - Rollout Correction Math: https://github.com/volcengine/verl/blob/main/docs/algo/rollout_corr_math.md + +set -xeuo pipefail + +# ============================================================================== +# Rollout Correction Configuration (PPO-clip + multi RS) +# ============================================================================== + +# Importance Sampling (IS) weights configuration +rollout_is="token" # Token-level IS for metrics/analysis +rollout_is_threshold=2.0 # Upper threshold for IS weights +rollout_is_batch_normalize="false" # Keep raw truncated weights + +# Rejection Sampling (RS) configuration (multi-criteria) +# - token_k1 keeps per-token ratios inside [lower, upper] +# - seq_max_k2 rejects sequences with extreme chi-square spikes +rollout_rs="token_k1,seq_max_k2" +rollout_rs_threshold="0.6_1.6,2.5" + +# Bypass PPO mode (reuse rollout_log_prob) +bypass_mode="true" +loss_type="ppo_clip" + +# ============================================================================== +# Model and Data Configuration +# ============================================================================== + +MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2.5-7B"} +TRAIN_FILE=${TRAIN_FILE:-"data/train.parquet"} +TEST_FILE=${TEST_FILE:-"data/test.parquet"} + +max_prompt_length=2048 +max_response_length=4096 + +# ============================================================================== +# Training Configuration +# ============================================================================== + +train_batch_size=128 +ppo_mini_batch_size=32 +ppo_epochs=1 +learning_rate=3e-6 + +# ============================================================================== +# Algorithm Configuration +# ============================================================================== + +adv_estimator=grpo +gamma=1.0 + +# 
============================================================================== +# Launch Training +# ============================================================================== + +python3 -m verl.trainer.main_ppo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_batch_size} \ + data.truncation='left' \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.gamma=${gamma} \ + algorithm.rollout_correction.rollout_is=${rollout_is} \ + algorithm.rollout_correction.rollout_is_threshold=${rollout_is_threshold} \ + algorithm.rollout_correction.rollout_is_batch_normalize=${rollout_is_batch_normalize} \ + algorithm.rollout_correction.rollout_rs=\'${rollout_rs}\' \ + algorithm.rollout_correction.rollout_rs_threshold=\'${rollout_rs_threshold}\' \ + algorithm.rollout_correction.bypass_mode=${bypass_mode} \ + algorithm.rollout_correction.loss_type=${loss_type} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=${learning_rate} \ + actor_rollout_ref.actor.ppo_mini_batch_size=${ppo_mini_batch_size} \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.actor.ppo_epochs=${ppo_epochs} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.rollout.name=vllm \ + trainer.logger='["console","wandb"]' \ + trainer.project_name="rollout_corr_multi_rs_example" \ + trainer.experiment_name="ppo_clip_multi_rs" \ + trainer.total_epochs=5 + +echo "Training completed!" 
+echo "" +echo "Multi-RS Configuration:" +echo " - rollout_is: ${rollout_is} (threshold=${rollout_is_threshold}, batch_norm=${rollout_is_batch_normalize})" +echo " - rollout_rs: ${rollout_rs}" +echo " - rollout_rs_threshold: ${rollout_rs_threshold}" +echo " - bypass_mode: ${bypass_mode}, loss_type: ${loss_type}" +echo "" +echo "Track these metrics in wandb:" +echo " - rollout_corr/rollout_rs_token_k1_mean" +echo " - rollout_corr/rollout_rs_seq_max_k2_mean" +echo " - rollout_corr/rollout_rs_masked_fraction" diff --git a/code/RL_model/verl/verl_train/examples/router_replay/README.md b/code/RL_model/verl/verl_train/examples/router_replay/README.md new file mode 100644 index 0000000000000000000000000000000000000000..93006431ee2be922b9b61051ad662ace9e542a08 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/router_replay/README.md @@ -0,0 +1,71 @@ +# Router Replay + +Router Replay is an advanced routing replay functionality within the Verl framework designed for Mixture of Experts (MoE) models. It enables deterministic training by recording and replaying routing decisions, ensuring consistent model behavior across training runs. 
+ + +## Key Features + +### Multiple Operating Modes +- **`disabled`**: Router replay functionality is completely disabled +- **`R2`**: Standard router replay mode for recording and replaying routing decisions +- **`R3`**: Rollout-specific router replay mode optimized for reinforcement learning workflows + +### Core Capabilities +- **Seamless Integration**: Works with reinforcement learning pipelines including PPO +- **Distributed Training Support**: Compatible with multi-GPU and multi-node training environments +- **Flexible Configuration**: Easy to configure via YAML files or command-line parameters + +## Configuration + +### RouterReplayConfig Parameters + +```yaml +router_replay: + mode: "disabled" # Available options: disabled, R2, R3 + record_file: null # Path for recording routing decisions + replay_file: null # Path for replaying recorded decisions +``` + +## Quick Start Guide + +### Enabling R2 Mode + +#### Configuration File Method +Add the following to your training configuration: + +```yaml +actor: + router_replay: + mode: "R2" +``` + +#### Command Line Method +Enable R2 mode via command-line parameters: + +```bash +actor_rollout_ref.actor.router_replay.mode="R2" +``` + +### Enabling R3 Mode + +#### Configuration File Method +Configure both actor and rollout settings: + +```yaml +# Actor configuration +router_replay: + mode: "R3" + +# Rollout configuration +enable_rollout_routing_replay: True +``` + +#### Command Line Method +Enable R3 mode via command-line parameters: + +```bash +actor_rollout_ref.actor.router_replay.mode="R3" +actor_rollout_ref.rollout.enable_rollout_routing_replay=True +``` + +R3 mode requires the rollout backend to support returning router selection results. 
Currently, this functionality is being tested based on the vllm implementation at https://github.com/vllm-project/vllm/pull/28284 as well as bug fix at https://github.com/vllm-project/vllm/pull/33013 and SGLang implementation at https://github.com/sgl-project/sglang/commit/bed301a5acaa9577c9aa706468bdf242f6a43051. diff --git a/code/RL_model/verl/verl_train/examples/router_replay/run_qwen30_a3b_megatron_sglang.sh b/code/RL_model/verl/verl_train/examples/router_replay/run_qwen30_a3b_megatron_sglang.sh new file mode 100644 index 0000000000000000000000000000000000000000..e19a50a4214e01844af89be9b1e516b0e1a13339 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/router_replay/run_qwen30_a3b_megatron_sglang.sh @@ -0,0 +1,110 @@ + +set -x + +NODES=6 + +# R2: enable routing replay +# R3: enable rollout routing replay +# If enabling R3, please set actor_rollout_ref.rollout.enable_rollout_routing_replay=True +# R3 example is based on SGLang related commit https://github.com/sgl-project/sglang/commit/bed301a5acaa9577c9aa706468bdf242f6a43051 + +ROUTING_REPLAY_MODE="R3" + +DIST_CKPT_PATH="" +HF_MODEL_PATH="" +TRAIN_DATA_PATH="" +TEST_DATA_PATH="" + +export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping +PP=6 +VPP=None +TP=1 +EP=8 +ETP=1 +SGLANG_INFER_TP=4 +offload=True +gpu_memory_utilization=0.65 +bs=3 +micro_bs=3 +use_dynamic_bsz=False +max_prompt_length=512 +max_response_length=512 +ppo_mini_batch_size=3 +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) + + +exper_name=Node${NODES}_bs${bs}_${PP}${TP}${EP}${ETP}_${SGLANG_INFER_TP}_minbs${ppo_mini_batch_size}_micro_bs${micro_bs} + +python3 -m verl.trainer.main_ppo --config-path=config \ + --config-name='ppo_megatron_trainer.yaml' \ + algorithm.adv_estimator=grpo \ + data.train_files=$TRAIN_DATA_PATH \ + data.val_files=$TEST_DATA_PATH \ + data.train_batch_size=$bs \ + 
data.max_prompt_length=$max_prompt_length \ + data.max_response_length=$max_response_length \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.use_fused_kernels=True \ + actor_rollout_ref.model.path=$HF_MODEL_PATH \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.actor.router_replay.mode=${ROUTING_REPLAY_MODE} \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \ + +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.bias_activation_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \ + +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=False \ + actor_rollout_ref.actor.megatron.param_offload=${offload} \ + actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ + actor_rollout_ref.actor.megatron.grad_offload=${offload} \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=$ppo_mini_batch_size \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=$micro_bs \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \ + actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP \ + 
actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=$micro_bs \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$SGLANG_INFER_TP \ + actor_rollout_ref.rollout.name=sglang \ + actor_rollout_ref.rollout.enable_rollout_routing_replay=True \ + actor_rollout_ref.rollout.skip_tokenizer_init=True \ + actor_rollout_ref.rollout.mode=async \ + actor_rollout_ref.actor.megatron.use_mbridge=True \ + actor_rollout_ref.rollout.gpu_memory_utilization=$gpu_memory_utilization \ + actor_rollout_ref.rollout.n=8 \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=$micro_bs \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP \ + actor_rollout_ref.ref.megatron.expert_model_parallel_size=$EP \ + actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=$ETP \ + actor_rollout_ref.ref.megatron.param_offload=${offload} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger=['console'] \ + trainer.project_name='verl_grpo_example_gsm8k_math' \ + trainer.experiment_name="$exper_name" \ + trainer.nnodes=$NODES \ + trainer.n_gpus_per_node=8 \ + trainer.save_freq=-1 \ + trainer.test_freq=10 \ + trainer.total_training_steps=50000 \ + 
trainer.balance_batch=False \ + trainer.val_before_train=False 2>&1 diff --git a/code/RL_model/verl/verl_train/examples/router_replay/run_qwen30_a3b_megatron_vllm.sh b/code/RL_model/verl/verl_train/examples/router_replay/run_qwen30_a3b_megatron_vllm.sh new file mode 100644 index 0000000000000000000000000000000000000000..74e7af0dee0455c458c7aef86671bcaef525d08a --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/router_replay/run_qwen30_a3b_megatron_vllm.sh @@ -0,0 +1,110 @@ + +set -x + +NODES=1 + +# R2: enable routing replay +# R3: enable rollout routing replay +# If enabling R3, please set actor_rollout_ref.rollout.enable_rollout_routing_replay=True +# R3 example is based on vllm related pr: +# - https://github.com/vllm-project/vllm/pull/28284 +# - https://github.com/vllm-project/vllm/pull/33013 + +ROUTING_REPLAY_MODE="R2" + +DIST_CKPT_PATH="" +HF_MODEL_PATH="" +TRAIN_DATA_PATH="" +TEST_DATA_PATH="" + +export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping +PP=1 +VPP=None +TP=2 +EP=8 +ETP=1 +VLLM_INFER_TP=2 +offload=True +gpu_memory_utilization=0.65 +bs=8 +micro_bs=3 +use_dynamic_bsz=True +max_prompt_length=1024 +max_response_length=1024 +ppo_mini_batch_size=8 +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2)) + + +exper_name=Node${NODES}_bs${bs}_${PP}${TP}${EP}${ETP}_${VLLM_INFER_TP}_minbs${ppo_mini_batch_size}_micro_bs${micro_bs} + +python3 -m verl.trainer.main_ppo --config-path=config \ + --config-name='ppo_megatron_trainer.yaml' \ + algorithm.adv_estimator=grpo \ + data.train_files=$TRAIN_DATA_PATH \ + data.val_files=$TEST_DATA_PATH \ + data.train_batch_size=$bs \ + data.max_prompt_length=$max_prompt_length \ + data.max_response_length=$max_response_length \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.use_fused_kernels=True \ + actor_rollout_ref.model.path=$HF_MODEL_PATH \ 
+ actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.actor.router_replay.mode=${ROUTING_REPLAY_MODE} \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \ + +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.bias_activation_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \ + +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \ + actor_rollout_ref.actor.megatron.param_offload=${offload} \ + actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ + actor_rollout_ref.actor.megatron.grad_offload=${offload} \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=$ppo_mini_batch_size \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=$micro_bs \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \ + actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP \ + actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + 
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=$micro_bs \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$VLLM_INFER_TP \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.mode=async \ + actor_rollout_ref.actor.megatron.use_mbridge=True \ + actor_rollout_ref.rollout.gpu_memory_utilization=$gpu_memory_utilization \ + actor_rollout_ref.rollout.n=8 \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=$micro_bs \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP \ + actor_rollout_ref.ref.megatron.expert_model_parallel_size=$EP \ + actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=$ETP \ + actor_rollout_ref.ref.megatron.param_offload=${offload} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger=['console'] \ + trainer.project_name='verl_grpo_example_gsm8k_math' \ + trainer.experiment_name="$exper_name" \ + trainer.nnodes=$NODES \ + trainer.n_gpus_per_node=8 \ + trainer.save_freq=-1 \ + trainer.test_freq=10 \ + trainer.total_training_steps=50000 \ + trainer.balance_batch=False \ + trainer.val_before_train=False 2>&1 diff --git a/code/RL_model/verl/verl_train/examples/sglang_multiturn/README.md b/code/RL_model/verl/verl_train/examples/sglang_multiturn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0c97c7e7507f3b5b108128c7068ea9ae6dae95ee --- /dev/null +++ 
b/code/RL_model/verl/verl_train/examples/sglang_multiturn/README.md @@ -0,0 +1,38 @@ +# Multi-Turn Rollout Example (GSM8K) + +This example demonstrates how to perform **multi-turn rollout** using SGLang with a tool-calling capable model (e.g., Qwen2.5-3B) on the GSM8K dataset. + +## Usage + +### Step 1: Download GSM8K Dataset + +```bash +cd examples/data_preprocess +python3 gsm8k_multiturn_w_tool.py +``` + +This will download and preprocess the GSM8K dataset into ~/data/gsm8k/. + +### Step 2: Run Multi-Turn Rollout + +If you have 8 GPUs +Use the standard 8-GPU script: + +```bash +cd your_verl_root_dir +bash examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn.sh +``` + +If you have only 4 GPUs +Use the fallback 4-GPU script: + +```bash +cd your_verl_root_dir +bash examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_4xgpu.sh +``` + +## Notes + +- The rollout supports multi-turn conversations with tool-calling capabilities. +- Current tools are used for GSM8K answer evaluation. +- Future versions may extend to search and code interpreter tools. 
diff --git a/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen0.5b_gsm8k_multiturn_curriculum.sh b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen0.5b_gsm8k_multiturn_curriculum.sh new file mode 100644 index 0000000000000000000000000000000000000000..d67a76e48fe12f3463cbc0c870c3fec3511ab7c8 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen0.5b_gsm8k_multiturn_curriculum.sh @@ -0,0 +1,56 @@ +# run on 8xH100 +# make sure your current working directory is the root of the project + +set -x + +ulimit -n 65535 + +PROJECT_DIR="$(pwd)" +CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config" + +python3 -m verl.trainer.main_ppo \ + --config-path="$CONFIG_PATH" \ + --config-name='gsm8k_multiturn_grpo' \ + algorithm.adv_estimator=grpo \ + data.sampler.class_name="RandomCurriculumSampler" \ + data.sampler.class_path="pkg://tests.utils.dataset.test_create_rl_sampler_on_cpu" \ + data.dataloader_num_workers=0 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.train_batch_size=256 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=sglang \ + 
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ + actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='gsm8k_async_rl' \ + trainer.experiment_name='qwen2.5-0.5b_function_rm-gsm8k-sgl-multi-w-tool-verify-n16' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=20 \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml" \ + trainer.total_epochs=15 $@ + diff --git a/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-0.5b_gsm8k_multiturn_w_interaction.sh b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-0.5b_gsm8k_multiturn_w_interaction.sh new file mode 100644 index 0000000000000000000000000000000000000000..4cf04ee616b4c28d44733c1f8cf9270002e96ee1 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-0.5b_gsm8k_multiturn_w_interaction.sh @@ -0,0 +1,58 @@ +# run on 8xH100 +# make sure your current working directory is the root of the project + +set -x + +ulimit -n 65535 + +PROJECT_DIR="$(pwd)" +CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config" +TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-512} +MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-8} +OFFLOAD=${OFFLOAD:-False} + +python3 -m verl.trainer.main_ppo \ + --config-path="$CONFIG_PATH" \ + --config-name='gsm8k_multiturn_grpo_w_interaction' \ + algorithm.adv_estimator=grpo \ + data.train_batch_size=$TRAIN_BATCH_SIZE \ + data.max_prompt_length=1024 \ + data.max_response_length=$((1024 * 3)) \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + 
actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.model.enable_activation_offload=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=$TRAIN_BATCH_SIZE \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=$MICRO_BATCH_SIZE \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.fsdp_config.param_offload=$OFFLOAD \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=$OFFLOAD \ + actor_rollout_ref.actor.fsdp_config.model_dtype=bfloat16 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=$MICRO_BATCH_SIZE \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=sglang \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ + actor_rollout_ref.rollout.n=8 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=$MICRO_BATCH_SIZE \ + actor_rollout_ref.ref.fsdp_config.param_offload=$OFFLOAD \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='gsm8k_async_rl' \ + trainer.experiment_name='qwen2.5-0.5b_function_rm-gsm8k-sgl-multi-w-interaction-n8' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=20 \ + data.train_files=$HOME/data/gsm8k_verl_sgl_multi_turn_w_interaction/train.parquet \ + data.val_files=$HOME/data/gsm8k_verl_sgl_multi_turn_w_interaction/test.parquet \ + actor_rollout_ref.rollout.multi_turn.interaction_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/interaction_config/gsm8k_interaction_config.yaml" \ + trainer.total_epochs=15 $@ + diff --git 
a/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn.sh b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn.sh new file mode 100644 index 0000000000000000000000000000000000000000..a2d17d45ad43a861a70e3a2813681625e701c062 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn.sh @@ -0,0 +1,67 @@ +# run on 8xH100 +# make sure your current working directory is the root of the project + +set -x + +ulimit -n 65535 + +PROJECT_DIR="$(pwd)" +CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config" + +function now() { + date '+%d-%H-%M' +} + +EXPERIMENT_NAME="qwen2.5-3b_baseline_$(now)" + +python3 -m verl.trainer.main_ppo \ + --config-path="$CONFIG_PATH" \ + --config-name='gsm8k_multiturn_grpo' \ + algorithm.adv_estimator=grpo \ + data.train_batch_size=256 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + global_profiler.tool=torch_memory \ + global_profiler.save_path=./mem_snapshots \ + global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries=100000 \ + global_profiler.global_tool_config.torch_memory.stack_depth=32 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ + 
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=sglang \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \ + actor_rollout_ref.rollout.multi_stage_wake_up=True \ + actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.rollout.over_sample_rate=0.1 \ + actor_rollout_ref.rollout.mode=async \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='multi-turn-grpo-qwen2.5-3b-sglang' \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=20 \ + trainer.val_before_train=True \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml" \ + trainer.total_epochs=15 $@ + diff --git a/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_4xgpu.sh b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_4xgpu.sh new file mode 100644 index 0000000000000000000000000000000000000000..9e61893b05393c28f314416b9250703883df34f3 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_4xgpu.sh @@ -0,0 +1,60 @@ +# run on 4xH100 +# make sure your current working directory is the root of the project + +set -x +export HYDRA_FULL_ERROR=1 +ulimit -n 65535 + +PROJECT_DIR="$(pwd)" +CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config" + +python3 -m verl.trainer.main_ppo \ + --config-path="$CONFIG_PATH" \ + --config-name='gsm8k_multiturn_grpo' \ + algorithm.adv_estimator=grpo \ + data.train_batch_size=256 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + 
data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=sglang \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ + actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='gsm8k_async_rl' \ + trainer.experiment_name='qwen2.5-3b_function_rm-gsm8k-async-sgl-multi-w-tool-verify-n16-4cards' \ + trainer.n_gpus_per_node=4 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=20 \ + trainer.total_epochs=15 \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=8192 \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=8192 \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=8192 \ + critic.ppo_max_token_len_per_gpu=8192 \ + critic.forward_max_token_len_per_gpu=8192 \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + 
actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml" \ + actor_rollout_ref.rollout.multi_turn.interaction_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/interaction_config/gsm8k_interaction_config.yaml" \ + actor_rollout_ref.rollout.multi_turn.max_user_turns=1 \ + $@ \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_4xgpu_server.sh b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_4xgpu_server.sh new file mode 100644 index 0000000000000000000000000000000000000000..79e5e568e76f923595847bb1048323e9f382b654 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_4xgpu_server.sh @@ -0,0 +1,60 @@ +# run on 4xH100 +# make sure your current working directory is the root of the project + +set -x +export HYDRA_FULL_ERROR=1 +ulimit -n 65535 + +PROJECT_DIR="$(pwd)" +CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config" + +python3 -m verl.trainer.main_ppo \ + --config-path="$CONFIG_PATH" \ + --config-name='gsm8k_multiturn_grpo_server' \ + algorithm.adv_estimator=grpo \ + data.train_batch_size=256 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + 
actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ + actor_rollout_ref.rollout.name=sglang \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console", "wandb"]' \ + trainer.project_name='gsm8k_async_rl_server' \ + trainer.experiment_name='qwen2.5-3b_function_rm-gsm8k-async-sgl-multi-w-tool-verify-n16-4cards' \ + trainer.n_gpus_per_node=4 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=20 \ + trainer.total_epochs=15 \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=8192 \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=8192 \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=8192 \ + critic.ppo_max_token_len_per_gpu=8192 \ + critic.forward_max_token_len_per_gpu=8192 \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml" \ + actor_rollout_ref.rollout.multi_turn.interaction_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/interaction_config/gsm8k_interaction_config.yaml" \ + actor_rollout_ref.rollout.multi_turn.max_user_turns=1 \ + $@ \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_server.sh b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_server.sh new file mode 100644 index 0000000000000000000000000000000000000000..17f2ed40b8a2ec607019490f5b1d45c1c4e8aea7 --- /dev/null +++ 
b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_server.sh @@ -0,0 +1,62 @@ +# run on 8xH100 +# make sure your current working directory is the root of the project + +set -x + +ulimit -n 65535 + +PROJECT_DIR="$(pwd)" +CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config" + +function now() { + date '+%d-%H-%M' +} + +EXPERIMENT_NAME="qwen2.5-3b_baseline_$(now)" + +python3 -m verl.trainer.main_ppo \ + --config-path="$CONFIG_PATH" \ + --config-name='gsm8k_multiturn_grpo_server' \ + algorithm.adv_estimator=grpo \ + data.train_batch_size=256 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=sglang \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \ + actor_rollout_ref.rollout.multi_stage_wake_up=True \ + actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.rollout.over_sample_rate=0 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + 
trainer.project_name='multi-turn-grpo-qwen2.5-3b-sglang' \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=20 \ + trainer.val_before_train=True \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml" \ + trainer.total_epochs=15 $@ + diff --git a/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_vllm_fsdp.sh b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_vllm_fsdp.sh new file mode 100644 index 0000000000000000000000000000000000000000..c3c40b1076c2e9d2deb63af05564915b467ba109 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_vllm_fsdp.sh @@ -0,0 +1,59 @@ +# run on Ascend 910 +# make sure your current working directory is the root of the project + +set -x +ulimit -n 65535 + +#set vllm v1 env +export VLLM_USE_V1=1 + +PROJECT_DIR="$(pwd)" +CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config" + +TRAIN_BATCH_SIZE=32 +MICRO_BATCH_SIZE=8 + +python3 -m verl.trainer.main_ppo \ + --config-path="$CONFIG_PATH" \ + --config-name='gsm8k_multiturn_grpo' \ + actor_rollout_ref.rollout.name=vllm \ + algorithm.adv_estimator=grpo \ + data.train_batch_size=${TRAIN_BATCH_SIZE} \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path="Qwen/Qwen2.5-3B-Instruct" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${TRAIN_BATCH_SIZE} \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${MICRO_BATCH_SIZE} \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + 
actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${MICRO_BATCH_SIZE} \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.n=8 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${MICRO_BATCH_SIZE} \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.project_name='gsm8k_async_rl' \ + trainer.experiment_name='qwen2.5-3b_function_rm-gsm8k-sgl-multi-w-tool-verify-n16' \ + trainer.n_gpus_per_node=16 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=20 \ + trainer.logger='["console"]' \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + trainer.total_epochs=15 \ + actor_rollout_ref.rollout.trace.token2text=False \ + actor_rollout_ref.rollout.mode=async \ + actor_rollout_ref.rollout.multi_turn.enable=true \ + actor_rollout_ref.rollout.enforce_eager=True \ + actor_rollout_ref.actor.use_torch_compile=False \ + actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml" \ + actor_rollout_ref.rollout.free_cache_engine=True \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_tool_agent_mlflow.sh b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_tool_agent_mlflow.sh new file mode 100644 index 0000000000000000000000000000000000000000..11c104fa94f4b19657149e2018da0a1321831083 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_tool_agent_mlflow.sh @@ -0,0 +1,57 @@ +# run on 8xH100 +# make sure your current working directory is the root of the project + +set -x + +ulimit -n 65535 + +PROJECT_DIR="$(pwd)" 
+CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config" + +python3 -m verl.trainer.main_ppo \ + --config-path="$CONFIG_PATH" \ + --config-name='gsm8k_multiturn_grpo' \ + algorithm.adv_estimator=grpo \ + data.train_batch_size=256 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=sglang \ + actor_rollout_ref.rollout.mode=async \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ + actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.rollout.trace.backend=mlflow \ + actor_rollout_ref.rollout.trace.token2text=True \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","mlflow"]' \ + trainer.project_name='gsm8k_tool-agent' \ + trainer.experiment_name='qwen2.5-3b_function_rm-gsm8k-sgl-tool-agent-verify-n16' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=20 \ + trainer.total_training_steps=2 \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + 
data.val_files=$HOME/data/gsm8k/test.parquet \ + actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml" \ + trainer.total_epochs=15 $@ + diff --git a/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_megatron_gsm8k_multiturn.sh b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_megatron_gsm8k_multiturn.sh new file mode 100644 index 0000000000000000000000000000000000000000..5522ee9250986ca0058e86c8438c03d81c3bac90 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen2.5-3b_megatron_gsm8k_multiturn.sh @@ -0,0 +1,64 @@ +# run on 8xH100 +# make sure your current working directory is the root of the project +# this is a verification training script, the parallel setting should be tuned to your model + +set -x + +export PYTHONUNBUFFERED=1 +export RAY_DEDUP_LOGS=0 +export RUST_BACKTRACE=1 +export HYDRA_FULL_ERROR=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +ulimit -n 65535 + +PROJECT_DIR="$(pwd)" +CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config" + +python3 -m verl.trainer.main_ppo \ + --config-path="$CONFIG_PATH" \ + --config-name='gsm8k_multiturn_megatron_grpo' \ + algorithm.adv_estimator=grpo \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path=/user/longxiang1/models/Qwen/Qwen2.5-3B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \ + actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \ + actor_rollout_ref.actor.megatron.context_parallel_size=2 \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \ + 
actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.megatron.seed=42 \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \ + actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=2 \ + actor_rollout_ref.ref.megatron.context_parallel_size=2 \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=sglang \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ + actor_rollout_ref.rollout.n=8 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='gsm8k_async_rl' \ + trainer.experiment_name='qwen2.5-3b_function_rm-gsm8k-sgl-multi-w-tool-n8-mcore-v2505201745_seed42' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=20 \ + data.train_files=/user/longxiang1/data/gsm8k_verl_sgl_multi_turn_preprocessed_v2/train.parquet \ + data.val_files=/user/longxiang1/data/gsm8k_verl_sgl_multi_turn_preprocessed_v2/test.parquet \ + actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml" \ + trainer.total_epochs=15 $@ + diff --git a/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen3-4b_gsm8k_multiturn.sh b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen3-4b_gsm8k_multiturn.sh new file mode 100644 index 0000000000000000000000000000000000000000..fc56ed209826de3fac78b828fa9af236f1102647 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen3-4b_gsm8k_multiturn.sh @@ -0,0 +1,55 @@ +# run on 8xH100 +# make sure your 
current working directory is the root of the project + +set -x + +ulimit -n 65535 + +PROJECT_DIR="$(pwd)" +CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config" + +python3 -m verl.trainer.main_ppo \ + --config-path="$CONFIG_PATH" \ + --config-name='gsm8k_multiturn_grpo' \ + algorithm.adv_estimator=grpo \ + data.train_batch_size=256 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path=Qwen/Qwen3-4B \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=sglang \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ + actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.rollout.over_sample_rate=0.1 \ + actor_rollout_ref.rollout.mode=async \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='gsm8k_async_rl' \ + trainer.experiment_name='qwen3-4b_function_rm-gsm8k-sgl-multi-w-tool-verify-n16' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=20 \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + 
data.val_files=$HOME/data/gsm8k/test.parquet \ + actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml" \ + trainer.total_epochs=15 $@ + diff --git a/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen3_4b_dapo_multiturn.sh b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen3_4b_dapo_multiturn.sh new file mode 100644 index 0000000000000000000000000000000000000000..d1c78aa859be6ea81b912129f8027d893b98bbea --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/sglang_multiturn/run_qwen3_4b_dapo_multiturn.sh @@ -0,0 +1,100 @@ +set -x + +ulimit -n 65535 + +PROJECT_DIR="$(pwd)" +CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config" + +pip install --upgrade "huggingface-hub>=0.34.0" +hf download \ + BytedTsinghua-SIA/DAPO-Math-17k \ + --repo-type dataset \ + --local-dir $HOME/data/BytedTsinghua-SIA/DAPO-Math-17k + + +hf download \ + Maxwell-Jia/AIME_2024 \ + --repo-type dataset \ + --local-dir $HOME/data/Maxwell-Jia/AIME_2024 + + +# Note: +# 1. +# a sandbox fusion server is needed to run the code interpreter tool. +# docker run -it -p 8080:8080 volcengine/sandbox-fusion:server-20250609 + +# 2. +# The model located at font-info/qwen3-4b-sft-SGLang-RL (https://huggingface.co/font-info/qwen3-4b-sft-SGLang-RL) +# is a fine-tuned version provided by the SGLang RL team. Without supervised fine-tuning (SFT) +# on the Retool dataset, Dapo training will not converge. + +# If you still wish to perform SFT from scratch, follow the steps below: + +# Step 1: Download the SFT dataset +#hf download JoeYing/ReTool-SFT --repo-type dataset --local-dir ./ReTool-SFT + +# Step 2: Preprocess the data for SFT +#python3 recipe/retool/retool_sft_preprocess.py + +# Step 3: Run SFT training +#bash recipe/retool/run_qwen2-32b_sft.sh + +# having trouble setup? 
see https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/blob/main/rlhf/verl/multi-turn/release_log/latest_sglang.md for more details. + + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + algorithm.use_kl_in_reward=False \ + algorithm.kl_ctrl.kl_coef=0.0 \ + data.train_files=$HOME/data/BytedTsinghua-SIA/DAPO-Math-17k \ + data.val_files=$HOME/data/Maxwell-Jia/AIME_2024 \ + data.return_raw_chat=True \ + data.train_batch_size=32 \ + data.max_prompt_length=2048 \ + data.max_response_length=16384 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.custom_cls.path=$PROJECT_DIR/recipe/retool/retool.py \ + data.custom_cls.name=CustomRLHFDataset \ + custom_reward_function.path=$PROJECT_DIR/recipe/retool/retool.py \ + custom_reward_function.name=compute_score \ + actor_rollout_ref.model.path=font-info/qwen3-4b-sft-SGLang-RL \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.kl_loss_coef=0.0 \ + actor_rollout_ref.actor.clip_ratio_low=0.2 \ + actor_rollout_ref.actor.clip_ratio_high=0.28 \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.use_dynamic_bsz=False \ + actor_rollout_ref.actor.ppo_mini_batch_size=32 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=32768 \ + actor_rollout_ref.rollout.name=sglang \ + actor_rollout_ref.rollout.mode=async \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \ + actor_rollout_ref.rollout.multi_stage_wake_up=True \ + actor_rollout_ref.rollout.multi_turn.enable=True \ + actor_rollout_ref.rollout.multi_turn.max_user_turns=16 \ + 
actor_rollout_ref.rollout.multi_turn.max_assistant_turns=16 \ + actor_rollout_ref.rollout.multi_turn.tool_config_path=$PROJECT_DIR/recipe/retool/sandbox_fusion_tool_config.yaml \ + actor_rollout_ref.rollout.multi_turn.format=hermes \ + actor_rollout_ref.rollout.n=8 \ + actor_rollout_ref.rollout.val_kwargs.top_p=0.6 \ + actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \ + actor_rollout_ref.rollout.val_kwargs.n=30 \ + trainer.logger=['console','wandb'] \ + trainer.project_name=sglang-dapo-multiturn \ + trainer.experiment_name=qwen3_4b_sft_dapo_multiturn \ + trainer.n_gpus_per_node=8 \ + trainer.log_val_generations=20 \ + trainer.val_before_train=True \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=20 \ + trainer.total_epochs=15 \ + $@ diff --git a/code/RL_model/verl/verl_train/examples/skypilot/README.md b/code/RL_model/verl/verl_train/examples/skypilot/README.md new file mode 100644 index 0000000000000000000000000000000000000000..78bd8458a83914a75c096dda8ef6e81e519981f1 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/skypilot/README.md @@ -0,0 +1,107 @@ +# verl with SkyPilot + +Run verl reinforcement learning training jobs on Kubernetes clusters or cloud platforms with GPU nodes using [SkyPilot](https://github.com/skypilot-org/skypilot). 
+ +## Installation and Configuration + +### Step 1: Install SkyPilot + +Choose the installation based on your target platform: + +```bash +# For Kubernetes only +pip install "skypilot[kubernetes]" + +# For AWS +pip install "skypilot[aws]" + +# For Google Cloud Platform +pip install "skypilot[gcp]" + +# For Azure +pip install "skypilot[azure]" + +# For multiple platforms +pip install "skypilot[kubernetes,aws,gcp,azure]" +``` + +### Step 2: Configure Your Platform + +See https://docs.skypilot.co/en/latest/getting-started/installation.html + +### Step 3: Set Up Environment Variables + +Export necessary API keys for experiment tracking: + +```bash +# For Weights & Biases tracking +export WANDB_API_KEY="your-wandb-api-key" + +# For HuggingFace gated models (if needed) +export HF_TOKEN="your-huggingface-token" +``` + +## Examples + +### PPO Training +```bash +sky launch -c verl-ppo verl-ppo.yaml --secret WANDB_API_KEY -y +``` +Runs PPO training on GSM8K dataset using Qwen2.5-0.5B-Instruct model across 2 nodes with H100 GPUs. Based on examples in [`../ppo_trainer/`](../ppo_trainer/). + +### GRPO Training +```bash +sky launch -c verl-grpo verl-grpo.yaml --secret WANDB_API_KEY -y +``` +Runs GRPO (Group Relative Policy Optimization) training on MATH dataset using Qwen2.5-7B-Instruct model. Memory-optimized configuration for 2 nodes. Based on examples in [`../grpo_trainer/`](../grpo_trainer/). + +### Multi-turn Tool Usage Training +```bash +sky launch -c verl-multiturn verl-multiturn-tools.yaml --secret WANDB_API_KEY --secret HF_TOKEN -y +``` +Single-node training with 8xH100 GPUs for multi-turn tool usage with Qwen2.5-3B-Instruct. Includes tool and interaction configurations for GSM8K. Based on examples in [`../sglang_multiturn/`](../sglang_multiturn/) but uses vLLM instead of sglang. + +## Configuration + +The example YAML files are pre-configured with: + +- **Infrastructure**: Kubernetes clusters (`infra: k8s`) - can be changed to `infra: aws` or `infra: gcp`, etc. 
+- **Docker Image**: verl's official Docker image with CUDA 12.6 support +- **Setup**: Automatically clones and installs verl from source +- **Datasets**: Downloads required datasets during setup phase +- **Ray Cluster**: Configures distributed training across nodes +- **Logging**: Supports Weights & Biases via `--secret WANDB_API_KEY` +- **Models**: Supports gated HuggingFace models via `--secret HF_TOKEN` + +## Launch Command Options + +- `-c `: Cluster name for managing the job +- `--secret KEY`: Pass secrets for API keys (can be used multiple times) +- `-y`: Skip confirmation prompt + +## Monitoring Your Jobs + +### Check cluster status +```bash +sky status +``` + +### View logs +```bash +sky logs verl-ppo # View logs for the PPO job +``` + +### SSH into head node +```bash +ssh verl-ppo +``` + +### Access Ray dashboard +```bash +sky status --endpoint 8265 verl-ppo # Get dashboard URL +``` + +### Stop a cluster +```bash +sky down verl-ppo +``` diff --git a/code/RL_model/verl/verl_train/examples/skypilot/verl-grpo.yaml b/code/RL_model/verl/verl_train/examples/skypilot/verl-grpo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f3d51855d1fd05befbffc7298bca8b6619d66d79 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/skypilot/verl-grpo.yaml @@ -0,0 +1,99 @@ +resources: + infra: k8s + accelerators: H100:1 + memory: 128+ + image_id: docker:verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4 + ports: 8265 + +num_nodes: 2 + +secrets: + WANDB_API_KEY: + +setup: | + rm -rf verl + git clone https://github.com/volcengine/verl.git + cd verl + pip3 install -v -e .[vllm] + pip3 install flashinfer-python + echo "Downloading Math dataset..." 
+ mkdir -p ~/data/math + python3 "$(pwd)/examples/data_preprocess/math_dataset.py" --local_dir ~/data/math + echo "Math dataset download completed" + +run: | + HEAD_IP=$(echo "$SKYPILOT_NODE_IPS" | head -n1) + NUM_NODES=$SKYPILOT_NUM_NODES + NUM_GPUS_PER_NODE=$SKYPILOT_NUM_GPUS_PER_NODE + + if [ "$SKYPILOT_NODE_RANK" == "0" ]; then + echo "Starting Ray head node..." + ps aux | grep ray | grep 6379 &> /dev/null || ray start --head --disable-usage-stats \ + --port=6379 \ + --dashboard-host=0.0.0.0 \ + --dashboard-port=8265 + + # Wait for all worker nodes to join + retry_count=0 + max_retries=30 + while [ $retry_count -lt $max_retries ]; do + connected_nodes=$(ray status 2>/dev/null | grep -c "node_" || echo "0") + echo "Connected nodes: $connected_nodes/$NUM_NODES (attempt $((retry_count+1))/$max_retries)" + + if [ "$connected_nodes" -ge "$NUM_NODES" ]; then + echo "All nodes connected to Ray cluster" + break + fi + + retry_count=$((retry_count+1)) + sleep 10 + done + + python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + data.train_files=$HOME/data/math/train.parquet \ + data.val_files=$HOME/data/math/test.parquet \ + data.train_batch_size=32 \ + data.max_prompt_length=256 \ + data.max_response_length=256 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-7B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=16 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.actor.ppo_epochs=1 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=True \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + 
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.rollout.n=1 \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=2048 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger=[console,wandb] \ + trainer.project_name=verl_math_grpo_demo \ + trainer.experiment_name=qwen25_7b_grpo \ + trainer.n_gpus_per_node=$NUM_GPUS_PER_NODE \ + trainer.nnodes=$NUM_NODES \ + trainer.save_freq=-1 \ + trainer.test_freq=-1 \ + trainer.total_epochs=1 + + else + sleep 15 + echo "Starting Ray worker node..." + ps aux | grep ray | grep $HEAD_IP:6379 &> /dev/null || ray start --address $HEAD_IP:6379 --disable-usage-stats + sleep 10 + fi + + echo "Node setup and Ray start script finished for rank $SKYPILOT_NODE_RANK." 
\ No newline at end of file diff --git a/code/RL_model/verl/verl_train/examples/skypilot/verl-multiturn-tools.yaml b/code/RL_model/verl/verl_train/examples/skypilot/verl-multiturn-tools.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7496ad83061ab572e9d405a668276ba0004b0864 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/skypilot/verl-multiturn-tools.yaml @@ -0,0 +1,91 @@ +resources: + infra: k8s + accelerators: H100:8 + memory: 128+ + image_id: docker:verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4 + ports: 8265 + +num_nodes: 1 + +secrets: + WANDB_API_KEY: + HF_TOKEN: # in case you're using gated models from the HF hub + +setup: | + rm -rf verl + git clone https://github.com/volcengine/verl.git + cd verl + pip3 install -v -e .[vllm] + pip3 install flashinfer-python + pip install "transformers<4.54.0" # https://github.com/vllm-project/vllm-ascend/issues/2046 + # Download GSM8K dataset for multiturn tool training + echo "Downloading GSM8K dataset..." + mkdir -p ~/data/gsm8k + python3 "$(pwd)/examples/data_preprocess/gsm8k.py" --local_dir ~/data/gsm8k + echo "GSM8K dataset download completed" + +run: | + NUM_GPUS_PER_NODE=$SKYPILOT_NUM_GPUS_PER_NODE + PROJECT_DIR="$(pwd)/verl" + CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config" + + # Single node setup - no worker coordination needed + echo "Starting Ray head node..." 
+ ps aux | grep ray | grep 6379 &> /dev/null || ray start --head --disable-usage-stats \ + --port=6379 \ + --dashboard-host=0.0.0.0 \ + --dashboard-port=8265 + + cd verl + + python3 -m verl.trainer.main_ppo \ + --config-path="$CONFIG_PATH" \ + --config-name='gsm8k_multiturn_grpo' \ + algorithm.adv_estimator=grpo \ + data.train_batch_size=512 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=512 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ + actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger=[console,wandb] \ + trainer.project_name=verl_multiturn_tools \ + trainer.experiment_name=qwen25_7b_gsm8k_multiturn_tools \ + trainer.n_gpus_per_node=$NUM_GPUS_PER_NODE \ + trainer.nnodes=1 \ + trainer.save_freq=10 \ + trainer.test_freq=5 \ + trainer.total_epochs=10 \ + 
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=8192 \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=8192 \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=8192 \ + critic.ppo_max_token_len_per_gpu=8192 \ + critic.forward_max_token_len_per_gpu=8192 \ + actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml" \ + actor_rollout_ref.rollout.multi_turn.interaction_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/interaction_config/gsm8k_interaction_config.yaml" \ + actor_rollout_ref.rollout.multi_turn.max_user_turns=1 + + echo "Node setup and Ray start script finished for rank $SKYPILOT_NODE_RANK." \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/examples/skypilot/verl-ppo.yaml b/code/RL_model/verl/verl_train/examples/skypilot/verl-ppo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1ba8de45aec6fcb19803b0c20c35f7c81f433d3 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/skypilot/verl-ppo.yaml @@ -0,0 +1,109 @@ +resources: + infra: k8s + accelerators: H100:1 + memory: 128+ + image_id: docker:verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4 + ports: 8265 + +num_nodes: 2 + +secrets: + WANDB_API_KEY: + +setup: | + rm -rf verl + git clone https://github.com/volcengine/verl.git + cd verl + pip3 install -v -e .[vllm] + pip3 install flashinfer-python + # Download GSM8K dataset - alternative approach + echo "Downloading GSM8K dataset..." 
+ mkdir -p ~/data/gsm8k + # Check if the script exists and use absolute path + if [ -f "$(pwd)/examples/data_preprocess/gsm8k.py" ]; then + python3 "$(pwd)/examples/data_preprocess/gsm8k.py" --local_dir ~/data/gsm8k + else + echo "Warning: gsm8k.py script not found, skipping dataset download" + # You might want to download the dataset manually or use a different approach + fi + echo "GSM8K dataset download completed" + +run: | + # Get the Head node's IP and total number of nodes + HEAD_IP=$(echo "$SKYPILOT_NODE_IPS" | head -n1) + NUM_NODES=$SKYPILOT_NUM_NODES + + # login wandb + # python3 -c "import wandb; wandb.login(relogin=True, key='$WANDB_API_KEY')" + + if [ "$SKYPILOT_NODE_RANK" == "0" ]; then + # Head node starts Ray Head + echo "Starting Ray head node..." + ps aux | grep ray | grep 6379 &> /dev/null || ray start --head --disable-usage-stats \ + --port=6379 \ + --dashboard-host=0.0.0.0 \ + --dashboard-port=8265 + + # Wait for all worker nodes to join the cluster with better checking + echo "Waiting for all nodes to join Ray cluster..." 
+ retry_count=0 + max_retries=30 + while [ $retry_count -lt $max_retries ]; do + connected_nodes=$(ray status 2>/dev/null | grep -c "node_" || echo "0") + echo "Connected nodes: $connected_nodes/$NUM_NODES (attempt $((retry_count+1))/$max_retries)" + + if [ "$connected_nodes" -ge "$NUM_NODES" ]; then + echo "All nodes connected to Ray cluster" + break + fi + + retry_count=$((retry_count+1)) + sleep 10 + done + + if [ $retry_count -eq $max_retries ]; then + echo "WARNING: Not all nodes connected to Ray cluster after $max_retries attempts" + echo "Current Ray status:" + ray status + fi + + python3 -m verl.trainer.main_ppo \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + data.train_batch_size=256 \ + data.max_prompt_length=512 \ + data.max_response_length=256 \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + critic.optim.lr=1e-5 \ + critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \ + critic.ppo_micro_batch_size_per_gpu=4 \ + algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.logger=[console,wandb] \ + trainer.val_before_train=False \ + trainer.default_hdfs_dir=null \ + trainer.n_gpus_per_node=1 \ + trainer.nnodes=2 \ + trainer.save_freq=20 \ + trainer.test_freq=20 \ + trainer.total_epochs=2 \ + trainer.project_name=verl_examples \ + trainer.experiment_name=experiment_name_gsm8k + + else + # Wait for Ray Head to start + sleep 15 + # Worker node starts Ray Worker + echo "Starting Ray worker node..." 
+ ps aux | grep ray | grep $HEAD_IP:6379 &> /dev/null || ray start --address $HEAD_IP:6379 --disable-usage-stats + sleep 10 + fi + + echo "Node setup and Ray start script finished for rank $SKYPILOT_NODE_RANK." \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/examples/split_placement/README.md b/code/RL_model/verl/verl_train/examples/split_placement/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a552972594f9ddd142d6889cdee1a5def55c2939 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/split_placement/README.md @@ -0,0 +1,61 @@ +# Split Placement Example +Here we introduce how to run the naive implementation of the split placement of PPO algorithm. +We will release the complete version of flexible placement in the near future. + + For quickstart, you can only follow Step 2 to modify the code and then follow Step 4 to execute the split placement example. + +### Step 1: Placing the models to different GPUs +Specify the placement and resource allocation. In the example, we place the actor and reference in the first half of the GPUs while map the critic and reward model (if any) to the second half of the GPUs. 
+```python +actor_rollout_ref_pool_id = 'actor_rollout_ref_pool' +critic_pool_id = 'critic_pool' +if config.trainer.nnodes // 2 == 0 and config.trainer.n_gpus_per_node // 2 > 0: +    resource_pool_spec = { +        actor_rollout_ref_pool_id: [config.trainer.n_gpus_per_node // 2] * config.trainer.nnodes, +        critic_pool_id: [config.trainer.n_gpus_per_node // 2] * config.trainer.nnodes, +    } +else: +    resource_pool_spec = { +        actor_rollout_ref_pool_id: [config.trainer.n_gpus_per_node] * (config.trainer.nnodes // 2), +        critic_pool_id: [config.trainer.n_gpus_per_node] * (config.trainer.nnodes // 2), +    } +print(f'resource_pool_spec: {resource_pool_spec}') +mapping = { +    Role.ActorRollout: actor_rollout_ref_pool_id, +    Role.Critic: critic_pool_id, +    Role.RefPolicy: actor_rollout_ref_pool_id, +} +mapping[Role.RewardModel] = critic_pool_id +``` + +### Step 2: Make the models execute asynchronously +Based on the model placement, we need to make the models execute asynchronously. + +To do so, you need to turn off the `blocking` flag (i.e., `blocking=False`) in our decorator of some model operations. +For example, we hope the actor update and critic update can be executed in parallel, then we need to make the following modification in `fsdp_workers.py` + +``` +@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False) +def update_actor(self, data: DataProto): +    ... + +@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False) +def update_critic(self, data: DataProto): +    ... +``` + +We can also parallelize the computation of `ref_log_prob` and `values` and `rewards` in the split placement. For simplicity of the tutorial, we don't do this in this example. + +### Step 3: Execute these operations in parallel in the single controller process +To implement the parallel execution of the actor and critic update, the only thing we need to modify in the `ray_trainer.py` is to `get` the concurrent `futures` on the single controller process.
+ +```python +critic_output = critic_output.get() +actor_output = actor_output.get() +``` + +### Step 4: Run the split placement example + +``` +bash run_deepseek7b_llm.sh +``` diff --git a/code/RL_model/verl/verl_train/examples/split_placement/main_ppo_split.py b/code/RL_model/verl/verl_train/examples/split_placement/main_ppo_split.py new file mode 100644 index 0000000000000000000000000000000000000000..e619d9d3965d8967186ae611308544df75b886f0 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/split_placement/main_ppo_split.py @@ -0,0 +1,217 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Note that we don't combine the main with ray_trainer as ray_trainer is used by other main. 
+""" + +import hydra +import ray +import torch +from omegaconf import OmegaConf +from split_monkey_patch import fit + +from verl import DataProto +from verl.trainer.ppo.ray_trainer import RayPPOTrainer +from verl.trainer.ppo.utils import need_reference_policy +from verl.utils.reward_score import gsm8k, math_reward + + +def _select_rm_score_fn(data_source): + if data_source == "openai/gsm8k": + return gsm8k.compute_score + elif data_source == "lighteval/MATH": + return math_reward.compute_score + else: + raise NotImplementedError + + +class RewardManager: + def __init__(self, tokenizer, num_examine) -> None: + self.tokenizer = tokenizer + self.num_examine = num_examine # the number of batches of decoded responses to print to the console + + def __call__(self, data: DataProto, return_dict: bool = False): + """We will expand this function gradually based on the available datasets""" + + # If there is rm score, we directly return rm score. Otherwise, we compute via rm_score_fn + if "rm_scores" in data.batch.keys(): + return data.batch["rm_scores"] + + reward_tensor = torch.zeros_like(data.batch["responses"], dtype=torch.float32) + + already_print_data_sources = {} + + for i in range(len(data)): + data_item = data[i] # DataProtoItem + + prompt_ids = data_item.batch["prompts"] + + prompt_length = prompt_ids.shape[-1] + + valid_prompt_length = data_item.batch["attention_mask"][:prompt_length].sum() + valid_prompt_ids = prompt_ids[-valid_prompt_length:] + + response_ids = data_item.batch["responses"] + valid_response_length = data_item.batch["attention_mask"][prompt_length:].sum() + valid_response_ids = response_ids[:valid_response_length] + + # decode + sequences = torch.cat((valid_prompt_ids, valid_response_ids)) + sequences_str = self.tokenizer.decode(sequences) + + ground_truth = data_item.non_tensor_batch["reward_model"]["ground_truth"] + + # select rm_score + data_source = data_item.non_tensor_batch["data_source"] + compute_score_fn = _select_rm_score_fn(data_source) 
+ + score = compute_score_fn(solution_str=sequences_str, ground_truth=ground_truth) + reward_tensor[i, valid_response_length - 1] = score + + if data_source not in already_print_data_sources: + already_print_data_sources[data_source] = 0 + + if already_print_data_sources[data_source] < self.num_examine: + already_print_data_sources[data_source] += 1 + print(sequences_str) + + if return_dict: + return {"reward_tensor": reward_tensor} + else: + return reward_tensor + + +@hydra.main(config_path="config", config_name="ppo_trainer_split", version_base=None) +def main(config): + if not ray.is_initialized(): + # this is for local ray cluster + default_runtime_env = {"env_vars": {"TOKENIZERS_PARALLELISM": "true", "NCCL_DEBUG": "WARN"}} + ray_init_kwargs = config.ray_kwargs.get("ray_init", {}) + runtime_env_kwargs = ray_init_kwargs.get("runtime_env", {}) + runtime_env = OmegaConf.merge(default_runtime_env, runtime_env_kwargs) + ray_init_kwargs = OmegaConf.create({**ray_init_kwargs, "runtime_env": runtime_env}) + print(f"ray init kwargs: {ray_init_kwargs}") + ray.init(**OmegaConf.to_container(ray_init_kwargs)) + + ray.get(main_task.remote(config)) + + +@ray.remote +def main_task(config): + # print initial config + from pprint import pprint + + from omegaconf import OmegaConf + + from verl.utils.fs import copy_to_local + + pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values + OmegaConf.resolve(config) + + # download the checkpoint from hdfs + local_path = copy_to_local(config.actor_rollout_ref.model.path) + + # instantiate tokenizer + from verl.utils import hf_tokenizer + + tokenizer = hf_tokenizer(local_path) + + # define worker classes + if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}: + assert config.critic.strategy in {"fsdp", "fsdp2"} + from verl.single_controller.ray import RayWorkerGroup + from verl.workers.fsdp_workers import ActorRolloutRefWorker, CriticWorker + + ray_worker_group_cls = RayWorkerGroup + + elif 
config.actor_rollout_ref.actor.strategy == "megatron": +        assert config.actor_rollout_ref.actor.strategy == config.critic.strategy +        from verl.single_controller.ray import RayWorkerGroup +        from verl.workers.megatron_workers import ActorRolloutRefWorker, CriticWorker + +        ray_worker_group_cls = RayWorkerGroup + +    else: +        raise NotImplementedError + +    from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role + +    role_worker_mapping = { +        Role.ActorRollout: ray.remote(ActorRolloutRefWorker), +        Role.Critic: ray.remote(CriticWorker), +    } + +    # NOTE: initialize two resource pools +    actor_rollout_ref_pool_id = "actor_rollout_ref_pool" +    critic_pool_id = "critic_pool" +    if config.trainer.nnodes // 2 == 0 and config.trainer.n_gpus_per_node // 2 > 0: +        resource_pool_spec = { +            actor_rollout_ref_pool_id: [config.trainer.n_gpus_per_node // 2] * config.trainer.nnodes, +            critic_pool_id: [config.trainer.n_gpus_per_node // 2] * config.trainer.nnodes, +        } +    else: +        resource_pool_spec = { +            actor_rollout_ref_pool_id: [config.trainer.n_gpus_per_node] * (config.trainer.nnodes // 2), +            critic_pool_id: [config.trainer.n_gpus_per_node] * (config.trainer.nnodes // 2), +        } +    print(f"resource_pool_spec: {resource_pool_spec}") +    mapping = { +        Role.ActorRollout: actor_rollout_ref_pool_id, +        Role.Critic: critic_pool_id, +    } + +    # use reference model +    if need_reference_policy(config): +        role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker) +        mapping[Role.RefPolicy] = actor_rollout_ref_pool_id + +    # we should adopt a multi-source reward function here +    # - for rule-based rm, we directly call a reward score +    # - for model-based rm, we call a model +    # - for code related prompt, we send to a sandbox if there are test cases +    # - finally, we combine all the rewards together +    # - The reward type depends on the tag of the data +    if config.reward_model.enable: +        if config.reward_model.strategy in {"fsdp", "fsdp2"}: +            from verl.workers.fsdp_workers import RewardModelWorker + 
elif config.reward_model.strategy == "megatron": + from verl.workers.megatron_workers import RewardModelWorker + else: + raise NotImplementedError + role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker) + mapping[Role.RewardModel] = critic_pool_id + + reward_fn = RewardManager(tokenizer=tokenizer, num_examine=0) + + # Note that we always use function-based RM for validation + val_reward_fn = RewardManager(tokenizer=tokenizer, num_examine=1) + + resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping) + + RayPPOTrainer.fit = fit + trainer = RayPPOTrainer( + config=config, + tokenizer=tokenizer, + role_worker_mapping=role_worker_mapping, + resource_pool_manager=resource_pool_manager, + ray_worker_group_cls=ray_worker_group_cls, + reward_fn=reward_fn, + val_reward_fn=val_reward_fn, + ) + trainer.init_workers() + trainer.fit() + + +if __name__ == "__main__": + main() diff --git a/code/RL_model/verl/verl_train/examples/split_placement/run_deepseek7b_llm.sh b/code/RL_model/verl/verl_train/examples/split_placement/run_deepseek7b_llm.sh new file mode 100644 index 0000000000000000000000000000000000000000..473dcccdd9bb355b43c93700bc0ccbe3de379b57 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/split_placement/run_deepseek7b_llm.sh @@ -0,0 +1,37 @@ +set -x + +python3 main_ppo_split.py \ + algorithm.adv_estimator=gae \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + data.train_batch_size=1024 \ + data.max_prompt_length=512 \ + data.max_response_length=512 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + 
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + critic.optim.lr=1e-5 \ + critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ + critic.model.enable_gradient_checkpointing=False \ + critic.ppo_micro_batch_size_per_gpu=8 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_example_gsm8k' \ + trainer.experiment_name='deepseek_llm_7b_function_rm' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.total_epochs=15 $@ diff --git a/code/RL_model/verl/verl_train/examples/split_placement/split_monkey_patch.py b/code/RL_model/verl/verl_train/examples/split_placement/split_monkey_patch.py new file mode 100644 index 0000000000000000000000000000000000000000..8cc73083dfd2755a013b099d86fd0ed75423d1d0 --- /dev/null +++ b/code/RL_model/verl/verl_train/examples/split_placement/split_monkey_patch.py @@ -0,0 +1,237 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +A naive implementation of the split placement example +""" + +import uuid +from copy import deepcopy +from pprint import pprint + +import numpy as np +import torch + +from verl import DataProto +from verl.trainer.ppo.ray_trainer import ( +    AdvantageEstimator, +    apply_kl_penalty, +    compute_advantage, +    compute_data_metrics, +    compute_timing_metrics, +    marked_timer, +) +from verl.trainer.ppo.reward import compute_reward +from verl.utils.metric import reduce_metrics + + +def fit(self): +    """ +    The training loop of PPO. +    The driver process only needs to call the compute functions of the worker group through RPC +    to construct the PPO dataflow. +    The light-weight advantage computation is done on the driver process. +    """ +    from omegaconf import OmegaConf + +    from verl.utils.tracking import Tracking + +    logger = Tracking( +        project_name=self.config.trainer.project_name, +        experiment_name=self.config.trainer.experiment_name, +        default_backend=self.config.trainer.logger, +        config=OmegaConf.to_container(self.config, resolve=True), +    ) + +    self.global_steps = 0 + +    # load checkpoint before doing anything +    self._load_checkpoint() + +    # perform validation before training +    # currently, we only support validation using the reward_function.
+ if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True): + val_metrics = self._validate() + pprint(f"Initial validation metrics: {val_metrics}") + logger.log(data=val_metrics, step=self.global_steps) + if self.config.trainer.get("val_only", False): + return + + # we start from step 1 + self.global_steps += 1 + last_val_metrics = None + + for epoch in range(self.config.trainer.total_epochs): + for batch_dict in self.train_dataloader: + metrics = {} + timing_raw = {} + + batch: DataProto = DataProto.from_single_dict(batch_dict) + + # pop those keys for generation + gen_batch = batch.pop(batch_keys=["input_ids", "attention_mask", "position_ids"]) + is_last_step = self.global_steps >= self.total_training_steps + + with marked_timer("step", timing_raw): + # generate a batch + with marked_timer("gen", timing_raw): + gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch) + timing_raw.update(gen_batch_output.meta_info["timing"]) + gen_batch_output.meta_info.pop("timing", None) + + if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX: + with marked_timer("gen_max", timing_raw): + gen_baseline_batch = deepcopy(gen_batch) + gen_baseline_batch.meta_info["do_sample"] = False + gen_baseline_output = self.actor_rollout_wg.generate_sequences(gen_baseline_batch) + + batch = batch.union(gen_baseline_output) + # compute reward model score on batch + rm_scores = None + if self.use_rm and "rm_scores" not in batch.batch.keys(): + rm_scores = self.rm_wg.compute_rm_score(batch) + batch = batch.union(rm_scores) + reward_baseline_tensor, _ = compute_reward(batch, self.reward_fn) + reward_baseline_tensor = reward_baseline_tensor.sum(dim=-1) + + keys_to_pop = set(gen_baseline_output.batch.keys()) + if rm_scores is not None: + keys_to_pop.update(rm_scores.batch.keys()) + batch.pop(batch_keys=list(keys_to_pop)) + + batch.batch["reward_baselines"] = reward_baseline_tensor + + del rm_scores, gen_baseline_batch, gen_baseline_output + 
+ batch.non_tensor_batch["uid"] = np.array( + [str(uuid.uuid4()) for _ in range(len(batch.batch))], dtype=object + ) + # repeat to align with repeated responses in rollout + batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) + batch = batch.union(gen_batch_output) + + # Balance the number of valid tokens across DP ranks. + # NOTE: This usually changes the order of data in the `batch`, + # which won't affect the advantage calculation (since it's based on uid), + # but might affect the loss calculation (due to the change of mini-batching). + # TODO: Decouple the DP balancing and mini-batching. + self._balance_batch(batch, metrics=metrics) + + # compute global_valid tokens + batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist() + + # recompute old_log_probs + with marked_timer("old_log_prob", timing_raw): + old_log_prob = self.actor_rollout_wg.compute_log_prob(batch) + batch = batch.union(old_log_prob) + + if self.use_reference_policy: + # compute reference log_prob + with marked_timer("ref", timing_raw): + ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch) + batch = batch.union(ref_log_prob) + + # compute values + if self.use_critic: + with marked_timer("values", timing_raw): + values = self.critic_wg.compute_values(batch) + batch = batch.union(values) + + with marked_timer("adv", timing_raw): + # compute scores. Support both model and function-based. + # We first compute the scores using reward model. Then, we call reward_fn to combine + # the results from reward model and rule-based results. + if self.use_rm and "rm_scores" not in batch.batch.keys(): + # we first compute reward model score + reward_tensor = self.rm_wg.compute_rm_score(batch) + batch = batch.union(reward_tensor) + + # we combine with rule-based rm + reward_tensor, _ = compute_reward(batch, self.reward_fn) + batch.batch["token_level_scores"] = reward_tensor + + # compute rewards. 
apply_kl_penalty if available + if self.config.algorithm.use_kl_in_reward: + batch, kl_metrics = apply_kl_penalty( + batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty + ) + metrics.update(kl_metrics) + else: + batch.batch["token_level_rewards"] = batch.batch["token_level_scores"] + + # compute advantages, executed on the driver process + norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True) + batch = compute_advantage( + batch, + adv_estimator=self.config.algorithm.adv_estimator, + gamma=self.config.algorithm.gamma, + lam=self.config.algorithm.lam, + num_repeat=self.config.actor_rollout_ref.rollout.n, + norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo, + config=self.config.algorithm, + ) + + # implement critic warmup + if self.config.trainer.critic_warmup <= self.global_steps: + # update actor + with marked_timer("update_actor_call", timing_raw): + actor_output = self.actor_rollout_wg.update_actor(batch) + else: + actor_output = None + + # update critic + if self.use_critic: + with marked_timer("update_critic_call", timing_raw): + critic_output = self.critic_wg.update_critic(batch) + + # NOTE: make sure you set blocking=False in update_actor and update_crtic in the worker class + with marked_timer("update_actor_critic", timing_raw): + critic_output = critic_output.get() + critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"]) + metrics.update(critic_output_metrics) + + if actor_output is not None: + actor_output = actor_output.get() + actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"]) + metrics.update(actor_output_metrics) + + # validate + if ( + self.val_reward_fn is not None + and self.config.trainer.test_freq > 0 + and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0) + ): + with marked_timer("testing", timing_raw): + val_metrics: dict = self._validate() + if is_last_step: + last_val_metrics = val_metrics + metrics.update(val_metrics) + 
+ if self.config.trainer.save_freq > 0 and ( + is_last_step or self.global_steps % self.config.trainer.save_freq == 0 + ): + with marked_timer("save_checkpoint", timing_raw): + self._save_checkpoint() + + # collect metrics + metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic)) + metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw)) + + # TODO: make a canonical logger that supports various backend + logger.log(data=metrics, step=self.global_steps) + + if self.global_steps >= self.total_training_steps: + pprint(f"Final validation metrics: {last_val_metrics}") + return + + self.global_steps += 1 diff --git a/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/files/config.yaml b/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e97a781ec2cbe0fc369a5145e488ef62e0f9d7e --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/files/config.yaml @@ -0,0 +1,814 @@ +_wandb: + value: + cli_version: 0.24.1 + e: + amll25lbtbfk9iytu1vtjmzh3eeznave: + args: + - --node-ip-address=172.16.34.29 + - --node-manager-port=38951 + - --object-store-name=/tmp/ray/session_2026-02-01_22-23-18_768519_1816845/sockets/plasma_store + - --raylet-name=/tmp/ray/session_2026-02-01_22-23-18_768519_1816845/sockets/raylet + - --redis-address=None + - --metrics-agent-port=59399 + - --logging-rotate-bytes=536870912 + - --logging-rotate-backup-count=5 + - --runtime-env-agent-port=56552 + - --gcs-address=172.16.34.29:63917 + - --session-name=session_2026-02-01_22-23-18_768519_1816845 + - --temp-dir=/tmp/ray + - --webui=127.0.0.1:8301 + - --cluster-id=93d3a0fb1eee93035a0185401a4444c9d568b2bd6eb31672bb611484 + - --startup-token=128 + - --worker-launch-time-ms=1770002613340 + - --node-id=d3806b374612df35510a7b4102a4580f233e4ec33d21441cde3a65c1 + - --runtime-env-hash=1096984665 + cpu_count: 64 + cpu_count_logical: 128 + 
cudaVersion: "13.0" + disk: + /: + total: "3766429188096" + used: "184645013504" + email: shahidulshakib034@gmail.com + executable: /home/mshahidul/miniconda3/envs/verl2/bin/python3 + git: + commit: d9939add7a2a01923a9088891f913a5d20c4e622 + remote: https://github.com/verl-project/verl + gpu: NVIDIA A100 80GB PCIe + gpu_count: 6 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-df506764-0db5-91b4-8ec9-154a3bb8123f + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328 + - architecture: Hopper + cudaCores: 14592 + memoryTotal: "85520809984" + name: NVIDIA H100 PCIe + uuid: GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb + - architecture: Hopper + cudaCores: 14592 + memoryTotal: "85520809984" + name: NVIDIA H100 PCIe + uuid: GPU-d42b6057-13e8-1e88-6aa1-9307df72dece + host: gamma + memory: + total: "1081814863872" + os: Linux-5.15.0-160-generic-x86_64-with-glibc2.35 + program: /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py + python: CPython 3.12.12 + root: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + startedAt: "2026-02-02T03:29:49.741937Z" + writerId: amll25lbtbfk9iytu1vtjmzh3eeznave + m: [] + python_version: 3.12.12 + t: + "1": + - 1 + - 11 + - 30 + - 41 + - 49 + - 50 + - 51 + - 71 + - 95 + - 98 + - 105 + "2": + - 1 + - 11 + - 30 + - 41 + - 49 + - 50 + - 51 + - 71 + - 95 + - 98 + - 105 + "3": + - 2 + - 13 + - 16 + - 61 + "4": 3.12.12 + "5": 0.24.1 + "6": 4.56.1 + "12": 0.24.1 + "13": linux-x86_64 +actor_rollout_ref: + value: + 
actor: + _target_: verl.workers.config.FSDPActorConfig + calculate_entropy: false + calculate_sum_pi_squared: false + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + async_save: false + load_contents: + - model + - optimizer + - extra + save_contents: + - model + - optimizer + - extra + clip_ratio: 0.2 + clip_ratio_c: 3 + clip_ratio_high: 0.2 + clip_ratio_low: 0.2 + data_loader_seed: 42 + entropy_checkpointing: false + entropy_coeff: 0 + entropy_from_logits_with_chunking: false + freeze_vision_tower: false + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: false + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + grad_clip: 1 + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + loss_agg_mode: token-mean + loss_scale_factor: null + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + betas: + - 0.9 + - 0.999 + clip_grad: 1 + lr: 1e-06 + lr_scheduler_type: constant + lr_warmup_steps: -1 + lr_warmup_steps_ratio: 0 + min_lr_ratio: 0 + num_cycles: 0.5 + optimizer: AdamW + optimizer_impl: torch.optim + override_optimizer_config: null + total_training_steps: 105 + warmup_style: null + weight_decay: 0.01 + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + clip_cov_lb: 1 + clip_cov_ratio: 0.0002 + clip_cov_ub: 5 + kl_cov_ratio: 0.0002 + loss_mode: vanilla + ppo_kl_coef: 0.1 + ppo_epochs: 1 + ppo_max_token_len_per_gpu: 16384 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 64 + ppo_mini_batch_size: 512 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: 
outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 5 + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + shuffle: false + strategy: fsdp + sum_pi_squared_checkpointing: false + tau_neg: 1.05 + tau_pos: 1 + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false + use_fused_kernels: false + use_kl_loss: true + use_prefix_grouper: false + use_remove_padding: true + use_torch_compile: true + hybrid_engine: true + model: + _target_: verl.workers.config.HFModelConfig + custom_chat_template: null + enable_activation_offload: false + enable_gradient_checkpointing: true + exclude_modules: null + external_lib: null + fused_kernel_options: + impl_backend: torch + hf_config_path: null + lora_adapter_path: null + lora_alpha: 16 + lora_rank: 0 + mtp: + _target_: verl.workers.config.MtpConfig + detach_encoder: false + enable: false + enable_rollout: false + enable_train: false + method: mtp + mtp_loss_scaling_factor: 0.1 + num_speculative_tokens: 1 + speculative_algorithm: EAGLE + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + speculative_num_steps: 3 + path: Qwen/Qwen3-4B-Instruct-2507 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + tokenizer_path: null + trust_remote_code: false + use_fused_kernels: false + use_liger: false + use_remove_padding: true + use_shm: false + nccl_timeout: 600 + ref: + _target_: verl.workers.config.FSDPActorConfig + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + 
fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: true + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + log_prob_max_token_len_per_gpu: 16384 + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 64 + log_prob_use_dynamic_bsz: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 5 + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_torch_compile: true + rollout: + _target_: verl.workers.config.RolloutConfig + agent: + _target_: verl.workers.config.AgentLoopConfig + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + name: null + path: null + default_agent_loop: single_turn_agent + num_workers: 8 + calculate_log_probs: false + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + cudagraph_capture_sizes: null + 
data_parallel_size: 1 + disable_log_stats: true + do_sample: true + dtype: bfloat16 + enable_chunked_prefill: true + enable_prefix_caching: true + enable_rollout_routing_replay: false + enforce_eager: false + expert_parallel_size: 1 + free_cache_engine: true + gpu_memory_utilization: 0.7 + ignore_eos: false + layered_summon: false + load_format: dummy + log_prob_max_token_len_per_gpu: 16384 + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 64 + log_prob_use_dynamic_bsz: false + logprobs_mode: processed_logprobs + max_model_len: null + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + mode: async + mtp: + _target_: verl.workers.config.MtpConfig + detach_encoder: false + enable: false + enable_rollout: false + enable_train: false + method: mtp + mtp_loss_scaling_factor: 0.1 + num_speculative_tokens: 1 + speculative_algorithm: EAGLE + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + speculative_num_steps: 3 + multi_stage_wake_up: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + format: hermes + interaction_config_path: null + max_assistant_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + max_user_turns: null + num_repeat_rollouts: null + tokenization_sanity_check_mode: strict + tool_config_path: null + tool_response_truncate_side: middle + use_inference_chat_template: false + "n": 5 + name: vllm + over_sample_rate: 0 + pipeline_model_parallel_size: 1 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: 
verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + port: 9090 + served_model_name: Qwen/Qwen3-4B-Instruct-2507 + prompt_length: 512 + quantization: null + quantization_config_file: null + response_length: 1024 + scheduling_policy: fcfs + skip_dump_dir: /tmp/rollout_dump + skip_rollout: false + skip_tokenizer_init: true + temperature: 1 + tensor_model_parallel_size: 1 + top_k: -1 + top_p: 1 + trace: + _target_: verl.workers.config.TraceConfig + backend: null + max_samples_per_step_per_worker: null + token2text: false + val_kwargs: + _target_: verl.workers.config.SamplingConfig + do_sample: false + "n": 1 + temperature: 0 + top_k: -1 + top_p: 1 +algorithm: + value: + _target_: verl.trainer.config.AlgoConfig + adv_estimator: grpo + gamma: 1 + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + horizon: 10000 + kl_coef: 0.001 + target_kl: 0.1 + type: fixed + kl_penalty: kl + lam: 1 + norm_adv_by_std_in_grpo: true + pf_ppo: + reweight_method: pow + weight_pow: 2 + rollout_correction: + bypass_mode: false + loss_type: ppo_clip + rollout_is: null + rollout_is_batch_normalize: false + rollout_is_threshold: 2 + rollout_rs: null + rollout_rs_threshold: null + use_kl_in_reward: false + use_pf_ppo: false +critic: + value: + _target_: verl.workers.config.FSDPCriticConfig + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + async_save: false + load_contents: + - model + - optimizer + - extra + save_contents: + - model + - optimizer + - extra + cliprange_value: 0.5 + data_loader_seed: 42 + enable: null + forward_max_token_len_per_gpu: 32768 + forward_micro_batch_size: null + forward_micro_batch_size_per_gpu: null + grad_clip: 1 + loss_agg_mode: token-mean + model: + _target_: verl.workers.config.FSDPCriticModelCfg + enable_activation_offload: false + 
enable_gradient_checkpointing: true + external_lib: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: false + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + lora_alpha: 16 + lora_rank: 0 + path: ~/models/deepseek-llm-7b-chat + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + tokenizer_path: Qwen/Qwen3-4B-Instruct-2507 + trust_remote_code: false + use_remove_padding: false + use_shm: false + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + betas: + - 0.9 + - 0.999 + clip_grad: 1 + lr: 1e-05 + lr_scheduler_type: constant + lr_warmup_steps: -1 + lr_warmup_steps_ratio: 0 + min_lr_ratio: 0 + num_cycles: 0.5 + optimizer: AdamW + optimizer_impl: torch.optim + override_optimizer_config: null + total_training_steps: 105 + warmup_style: null + weight_decay: 0.01 + ppo_epochs: 1 + ppo_max_token_len_per_gpu: 32768 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: null + ppo_mini_batch_size: 512 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 5 + shuffle: 
false + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false +custom_reward_function: + value: + name: compute_score + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +data: + value: + custom_cls: + name: null + path: null + datagen: + name: null + path: null + dataloader_num_workers: 8 + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + image_key: images + image_patch_size: 14 + max_prompt_length: 512 + max_response_length: 1024 + prompt_key: prompt + return_full_prompt: false + return_multi_modal_inputs: true + return_raw_chat: true + return_raw_input_ids: false + reward_fn_key: data_source + sampler: + class_name: null + class_path: null + seed: null + shuffle: true + tokenizer: null + tool_config_path: null + train_batch_size: 1024 + train_files: /home/mshahidul/data/gsm8k/train.parquet + train_max_samples: -1 + truncation: error + trust_remote_code: false + use_shm: false + val_batch_size: null + val_files: /home/mshahidul/data/gsm8k/test.parquet + val_max_samples: -1 + validation_shuffle: false + video_key: videos +global_profiler: + value: + _target_: verl.utils.profiler.ProfilerConfig + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + controller_nsight_options: + cuda-graph-trace: graph + cuda-memory-usage: "true" + trace: cuda,nvtx,cublas,ucx + discrete: false + worker_nsight_options: + capture-range: cudaProfilerApi + capture-range-end: null + cuda-graph-trace: graph + cuda-memory-usage: "true" + kill: none + trace: cuda,nvtx,cublas,ucx + torch_memory: + context: all + stack_depth: 32 + stacks: all + trace_alloc_max_entries: 100000 + profile_continuous_steps: false + save_path: outputs/profile + steps: null + tool: null +ray_kwargs: + value: + ray_init: + num_cpus: null + timeline_json_file: null +reward_manager: + value: + _target_: verl.trainer.config.config.RewardManagerConfig + module: + _target_: verl.trainer.config.config.ModuleConfig + 
name: custom_reward_manager + path: null + name: naive + source: register +reward_model: + value: + enable: false + enable_resource_pool: false + forward_max_token_len_per_gpu: 32768 + launch_reward_fn_async: false + max_length: null + micro_batch_size: null + micro_batch_size_per_gpu: null + model: + external_lib: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + forward_prefetch: false + fsdp_size: -1 + param_offload: false + reshard_after_forward: true + wrap_policy: + min_num_params: 0 + input_tokenizer: Qwen/Qwen3-4B-Instruct-2507 + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + trust_remote_code: false + use_fused_kernels: false + use_remove_padding: false + use_shm: false + n_gpus_per_node: 8 + nnodes: 0 + num_workers: 1 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + reward_loop_class_name: null + reward_loop_module_path: null + reward_loop_source: register + reward_manager: naive + rollout: + _target_: verl.workers.config.RolloutConfig + cudagraph_capture_sizes: null + data_parallel_size: 1 + disable_log_stats: true + dtype: bfloat16 + enable_chunked_prefill: true + enable_prefix_caching: true + enforce_eager: true + expert_parallel_size: 1 + free_cache_engine: true + gpu_memory_utilization: 0.5 + limit_images: null + load_format: auto + max_model_len: null + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + name: ??? 
+ prompt_length: 2048 + response_length: 2048 + skip_tokenizer_init: false + tensor_model_parallel_size: 2 + sandbox_fusion: + max_concurrent: 64 + memory_limit_mb: 1024 + url: null + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false + use_reward_loop: true +trainer: + value: + balance_batch: true + critic_warmup: 0 + default_hdfs_dir: null + default_local_dir: checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs + del_local_ckpt_after_load: false + device: cuda + esi_redundant_time: 0 + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + log_val_generations: 0 + logger: + - console + - wandb + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + n_gpus_per_node: 2 + nnodes: 1 + project_name: readctrl-verl + ray_wait_register_center_timeout: 300 + resume_from_path: null + resume_mode: auto + rollout_data_dir: null + save_freq: 20 + test_freq: 5 + total_epochs: 15 + total_training_steps: null + use_legacy_worker_impl: auto + val_before_train: true + val_only: false + validation_data_dir: null +transfer_queue: + value: + enable: false diff --git a/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..3224ab06776c2de9dd9c8255874882ea40a66448 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/files/output.log @@ -0,0 +1,79 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} +validation generation end +("Initial validation metrics: {'val-aux/openai/gsm8k/reward/mean@1': " + "np.float64(0.0), 'val-core/openai/gsm8k/acc/mean@1': np.float64(0.0), " + "'val-aux/num_turns/min': np.int32(2), 'val-aux/num_turns/max': np.int32(2), " + "'val-aux/num_turns/mean': np.float64(2.0)}") +step:0 - val-aux/openai/gsm8k/reward/mean@1:np.float64(0.0) - val-core/openai/gsm8k/acc/mean@1:np.float64(0.0) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) +Training Progress: 0%| | 0/105 [00:00) + File "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/concurrent/futures/_base.py", line 456, in result + return self.__get_result() + ^^^^^^^^^^^^^^^^^^^ + File "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/concurrent/futures/_base.py", line 401, in __get_result + raise self._exception + ^^^^^^^^^^^^^^^^^^^^^ + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/single_controller/ray/base.py", line 910, in func + return getattr(self.worker_dict[key], name)(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/single_controller/base/decorator.py", line 462, in inner + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/utils/transferqueue_utils.py", line 
314, in dummy_inner + output = func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/utils/profiler/profile.py", line 173, in wrapper + return func(self_instance, *args, **kwargs_inner) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/workers/fsdp_workers.py", line 1058, in compute_ref_log_prob + outputs = self.ref_policy.compute_log_prob(data=data, calculate_entropy=False) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/utils/profiler/performance.py", line 105, in f + return self.log(decorated_function, *args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/utils/profiler/performance.py", line 118, in log + output = func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/workers/actor/dp_actor.py", line 472, in compute_log_prob + outputs = self._forward_micro_batch( + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/workers/actor/dp_actor.py", line 244, in _forward_micro_batch + output = self.actor_module( + ^^^^^^^^^^^^^^^^^^ + File "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 854, in forward + output = 
self._fsdp_wrapped_module(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/transformers/utils/generic.py", line 940, in wrapper + output = func(self, *args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 494, in forward + logits = self.lm_head(hidden_states[:, slice_indices, :]) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/torch/nn/modules/linear.py", line 125, in forward + return F.linear(input, self.weight, self.bias) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 12.07 GiB. GPU 0 has a total capacity of 79.25 GiB of which 8.37 GiB is free. Including non-PyTorch memory, this process has 69.25 GiB memory in use. Process 1831585 has 1.58 GiB memory in use. Of the allocated memory 68.00 GiB is allocated by PyTorch, and 638.12 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) diff --git a/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/files/requirements.txt b/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..58b4ec54fa2f6ed38448fc9311264354380a5821 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/files/requirements.txt @@ -0,0 +1,268 @@ +verl==0.8.0.dev0 +psutil==7.1.3 +colorama==0.4.6 +annotated-doc==0.0.4 +sentry-sdk==2.51.0 +requests==2.32.5 +nvidia-cufile-cu12==1.13.1.3 +ml_dtypes==0.5.4 +xformers==0.0.32.post1 +sglang==0.5.2 +multidict==6.7.1 +typing_extensions==4.15.0 +nvidia-cusparselt-cu12==0.7.1 +openai-harmony==0.0.4 +transformers==4.56.1 +Werkzeug==3.1.5 +identify==2.6.16 +pytest==9.0.2 +nvidia-cuda-runtime-cu12==12.8.90 +GitPython==3.1.46 +cupy-cuda12x==13.6.0 +tokenizers==0.22.2 +pybind11==3.0.1 +google-api-core==2.29.0 +partial-json-parser==0.2.1.1.post7 +aiohttp-cors==0.8.1 +sniffio==1.3.1 +tensordict==0.10.0 +smart_open==7.5.0 +cffi==2.0.0 +wcwidth==0.5.3 +asttokens==3.0.1 +opencensus==0.11.4 +rpds-py==0.30.0 +py-spy==0.4.1 +gguf==0.17.1 +nvidia-nvjitlink-cu12==12.8.93 +httpx==0.28.1 +cuda-python==13.1.1 +annotated-types==0.7.0 +regex==2026.1.15 +vllm==0.11.0 +idna==3.11 +fsspec==2025.10.0 +parso==0.8.5 +pydantic-extra-types==2.11.0 +MarkupSafe==3.0.3 +cryptography==46.0.4 +filelock==3.20.3 +modelscope==1.34.0 +outlines==0.1.11 +dnspython==2.8.0 +scipy==1.17.0 +zipp==3.23.0 +PyYAML==6.0.3 +onnx==1.20.1 +torchdata==0.11.0 +cuda-pathfinder==1.3.3 +verl==0.8.0.dev0 +httptools==0.7.1 +opencv-python-headless==4.13.0.90 +importlib_metadata==8.7.1 +peft==0.18.1 +opentelemetry-sdk==1.39.1 +python-json-logger==4.0.0 
+cuda-bindings==13.1.1 +mdurl==0.1.2 +referencing==0.37.0 +xxhash==3.6.0 +interegular==0.3.3 +fastapi-cli==0.0.20 +uv==0.9.28 +tensorboard==2.20.0 +nvidia-cublas-cu12==12.8.4.1 +sentencepiece==0.2.1 +rich-toolkit==0.18.1 +numpy==2.2.0 +yarl==1.22.0 +opencv-fixer==0.2.5 +tqdm==4.67.2 +python-dotenv==1.2.1 +timm==1.0.16 +aiohappyeyeballs==2.6.1 +decord==0.6.0 +jiter==0.12.0 +airportsdata==20250909 +markdown-it-py==4.0.0 +nvidia-cusolver-cu12==11.7.3.90 +pyarrow==23.0.0 +opentelemetry-proto==1.39.1 +anyio==4.12.1 +pycryptodomex==3.23.0 +prometheus_client==0.24.1 +aiohttp==3.13.3 +urllib3==2.6.3 +pexpect==4.9.0 +pydantic-settings==2.12.0 +distro==1.9.0 +av==16.1.0 +cloudpickle==3.1.2 +mpmath==1.3.0 +certifi==2026.1.4 +antlr4-python3-runtime==4.9.3 +torchvision==0.23.0 +accelerate==1.12.0 +watchfiles==1.1.1 +ruff==0.14.14 +wheel==0.46.3 +omegaconf==2.3.0 +nvidia-cufft-cu12==11.3.3.83 +multiprocess==0.70.18 +frozendict==2.4.7 +sympy==1.14.0 +setproctitle==1.3.7 +setuptools==79.0.1 +py-cpuinfo==9.0.0 +ipython_pygments_lexers==1.1.1 +rich==14.3.2 +uvicorn==0.40.0 +openai==1.99.1 +outlines_core==0.2.11 +llvmlite==0.44.0 +nvidia-cuda-cupti-cu12==12.8.90 +attrs==25.4.0 +anthropic==0.77.0 +packaging==25.0 +fastrlock==0.8.3 +astor==0.8.1 +pluggy==1.6.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +psutil==7.2.2 +virtualenv==20.36.1 +cbor2==5.8.0 +compressed-tensors==0.11.0 +nvidia-cusparse-cu12==12.5.8.93 +networkx==3.6.1 +httpcore==1.0.9 +onnxscript==0.3.1 +smmap==5.0.2 +opencv-python==4.13.0.90 +traitlets==5.14.3 +python-multipart==0.0.22 +pyvers==0.1.0 +huggingface-hub==0.36.0 +pillow==12.1.0 +jsonschema==4.26.0 +cfgv==3.5.0 +optree==0.18.0 +email-validator==2.3.0 +tabulate==0.9.0 +pre_commit==4.5.1 +msgpack==1.1.2 +depyf==0.19.0 +numba==0.61.2 +six==1.17.0 +aiosignal==1.4.0 +nvidia-nvtx-cu12==12.8.90 +propcache==0.4.1 +torch_memory_saver==0.0.8 +h11==0.16.0 +frozenlist==1.8.0 +websockets==16.0 +nvidia-cudnn-frontend==1.18.0 +build==1.4.0 +google-auth==2.48.0 +pycountry==24.6.1 
+stack-data==0.6.3 +typing-inspection==0.4.2 +googleapis-common-protos==1.72.0 +pandas==3.0.0 +typer==0.21.1 +protobuf==6.33.5 +fastapi==0.128.0 +blake3==1.0.8 +opentelemetry-semantic-conventions==0.60b1 +opentelemetry-exporter-prometheus==0.60b1 +nvidia-cudnn-cu12==9.10.2.21 +Markdown==3.10.1 +liger_kernel==0.6.4 +nodeenv==1.10.0 +prompt_toolkit==3.0.52 +torchaudio==2.8.0 +codetiming==1.4.0 +platformdirs==4.5.1 +jsonschema-specifications==2025.9.1 +hydra-core==1.3.2 +tensorboard-data-server==0.7.2 +lm-format-enforcer==0.11.3 +pyasn1_modules==0.4.2 +tiktoken==0.12.0 +starlette==0.50.0 +pyproject_hooks==1.2.0 +flash_attn==2.8.1 +rsa==4.9.1 +ray==2.53.0 +nest-asyncio==1.6.0 +lark==1.2.2 +fastar==0.8.0 +orjson==3.11.6 +prometheus-fastapi-instrumentator==7.1.0 +opentelemetry-api==1.39.1 +mathruler==0.1.0 +pydantic_core==2.41.5 +fastapi-cloud-cli==0.11.0 +pynvml==13.0.1 +loguru==0.7.3 +torch==2.8.0 +msgspec==0.20.0 +nvidia-curand-cu12==10.3.9.90 +blobfile==3.0.0 +gitdb==4.0.12 +llguidance==0.7.30 +hf_transfer==0.1.9 +nvidia-nccl-cu12==2.27.3 +qwen-vl-utils==0.0.14 +ptyprocess==0.7.0 +opencensus-context==0.1.3 +jedi==0.19.2 +click==8.3.1 +datasets==4.5.0 +soxr==1.0.0 +sgl-kernel==0.3.9.post2 +colorful==0.5.8 +pyasn1==0.6.2 +charset-normalizer==3.4.4 +nvidia-ml-py==13.590.48 +hf-xet==1.2.0 +dill==0.4.0 +absl-py==2.4.0 +pydantic==2.12.5 +wrapt==2.1.0 +flashinfer-python==0.3.1 +python-dateutil==2.9.0.post0 +torchao==0.9.0 +cachetools==7.0.0 +soundfile==0.13.1 +diskcache==5.6.3 +onnx-ir==0.1.15 +docstring_parser==0.17.0 +matplotlib-inline==0.2.1 +Pygments==2.19.2 +wandb==0.24.1 +pure_eval==0.2.3 +ninja==1.13.0 +proto-plus==1.27.0 +pyzmq==27.1.0 +iniconfig==2.3.0 +Jinja2==3.1.6 +megatron-core==0.13.1 +uvloop==0.22.1 +pycparser==3.0 +pylatexenc==2.10 +decorator==5.2.1 +shellingham==1.5.4 +lxml==6.0.2 +safetensors==0.7.0 +xgrammar==0.1.25 +pybase64==1.4.3 +ipython==9.9.0 +mistral_common==1.9.0 +rignore==0.7.6 +einops==0.8.2 +distlib==0.4.0 +triton==3.4.0 +executing==2.2.1 
+grpcio==1.76.0 +pip==25.3 +verl==0.8.0.dev0 diff --git a/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/files/wandb-metadata.json b/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..ddf5378bab484f396c9be0066e7ae59385fbaba7 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/files/wandb-metadata.json @@ -0,0 +1,93 @@ +{ + "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.12", + "startedAt": "2026-02-02T03:29:49.741937Z", + "args": [ + "--node-ip-address=172.16.34.29", + "--node-manager-port=38951", + "--object-store-name=/tmp/ray/session_2026-02-01_22-23-18_768519_1816845/sockets/plasma_store", + "--raylet-name=/tmp/ray/session_2026-02-01_22-23-18_768519_1816845/sockets/raylet", + "--redis-address=None", + "--metrics-agent-port=59399", + "--logging-rotate-bytes=536870912", + "--logging-rotate-backup-count=5", + "--runtime-env-agent-port=56552", + "--gcs-address=172.16.34.29:63917", + "--session-name=session_2026-02-01_22-23-18_768519_1816845", + "--temp-dir=/tmp/ray", + "--webui=127.0.0.1:8301", + "--cluster-id=93d3a0fb1eee93035a0185401a4444c9d568b2bd6eb31672bb611484", + "--startup-token=128", + "--worker-launch-time-ms=1770002613340", + "--node-id=d3806b374612df35510a7b4102a4580f233e4ec33d21441cde3a65c1", + "--runtime-env-hash=1096984665" + ], + "program": "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py", + "git": { + "remote": "https://github.com/verl-project/verl", + "commit": "d9939add7a2a01923a9088891f913a5d20c4e622" + }, + "email": "shahidulshakib034@gmail.com", + "root": "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train", + "host": "gamma", + "executable": "/home/mshahidul/miniconda3/envs/verl2/bin/python3", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA A100 80GB PCIe", + 
"gpu_count": 6, + "disk": { + "/": { + "total": "3766429188096", + "used": "184645013504" + } + }, + "memory": { + "total": "1081814863872" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece" + } + ], + "cudaVersion": "13.0", + "writerId": "amll25lbtbfk9iytu1vtjmzh3eeznave" +} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/files/wandb-summary.json b/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c89a04866e6a3bb599f910731d47f357ee46708 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/files/wandb-summary.json @@ -0,0 +1 @@ 
+{"_step":0,"_wandb":{"runtime":442},"_timestamp":1.770003052452084e+09,"val-aux/num_turns/max":2,"val-aux/num_turns/min":2,"val-aux/openai/gsm8k/reward/mean@1":0,"val-aux/num_turns/mean":2,"val-core/openai/gsm8k/acc/mean@1":0,"_runtime":442.155582418} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/logs/debug-core.log b/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..461246917524219c4c912a95807b7cbd8d8f6c8b --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/logs/debug-core.log @@ -0,0 +1,12 @@ +{"time":"2026-02-01T22:29:50.729717733-05:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp451kvzg0/port-1825920.txt","pid":1825920,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-01T22:29:50.732595817-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":1825920} +{"time":"2026-02-01T22:29:50.732407964-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-1825920-1837376-1351032451/socket","Net":"unix"}} +{"time":"2026-02-01T22:29:50.836590485-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-01T22:29:50.880046708-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"yk5vgzhp","id":"1(@)"} +{"time":"2026-02-01T22:29:51.897610704-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"yk5vgzhp","id":"1(@)"} +{"time":"2026-02-01T22:29:58.683861179-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"gg3zdwg3i0cs"} +{"time":"2026-02-01T22:37:14.341303997-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"gg3zdwg3i0cs"} 
+{"time":"2026-02-01T22:37:14.970172132-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"gg3zdwg3i0cs"} +{"time":"2026-02-01T22:37:14.974024448-05:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"yk5vgzhp","id":"1(@)"} +{"time":"2026-02-01T22:37:14.977355948-05:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"yk5vgzhp","id":"1(@)"} +{"time":"2026-02-01T22:37:17.174817942-05:00","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/logs/debug-internal.log b/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d07f417faa563551235269555aee2e2bd1182887 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2026-02-01T22:29:50.882708284-05:00","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-01T22:29:51.892501209-05:00","level":"INFO","msg":"stream: created new stream","id":"yk5vgzhp"} +{"time":"2026-02-01T22:29:51.894405185-05:00","level":"INFO","msg":"handler: started","stream_id":"yk5vgzhp"} +{"time":"2026-02-01T22:29:51.897572528-05:00","level":"INFO","msg":"stream: started","id":"yk5vgzhp"} +{"time":"2026-02-01T22:29:51.897608959-05:00","level":"INFO","msg":"writer: started","stream_id":"yk5vgzhp"} +{"time":"2026-02-01T22:29:51.897648519-05:00","level":"INFO","msg":"sender: started","stream_id":"yk5vgzhp"} +{"time":"2026-02-01T22:37:14.813218378-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-01T22:37:14.963800708-05:00","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2026-02-01T22:37:14.97406244-05:00","level":"INFO","msg":"stream: closing","id":"yk5vgzhp"} 
+{"time":"2026-02-01T22:37:14.974087535-05:00","level":"INFO","msg":"handler: closed","stream_id":"yk5vgzhp"} +{"time":"2026-02-01T22:37:14.976564824-05:00","level":"INFO","msg":"sender: closed","stream_id":"yk5vgzhp"} +{"time":"2026-02-01T22:37:14.976610879-05:00","level":"INFO","msg":"stream: closed","id":"yk5vgzhp"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..75758f5ac3d870768ac2df199425266464b0425d --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/logs/debug.log @@ -0,0 +1,24 @@ +2026-02-01 22:29:49,757 INFO MainThread:1825920 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-01 22:29:49,758 INFO MainThread:1825920 [wandb_setup.py:_flush():81] Configure stats pid to 1825920 +2026-02-01 22:29:49,758 INFO MainThread:1825920 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-01 22:29:49,758 INFO MainThread:1825920 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/logs/debug.log +2026-02-01 22:29:49,758 INFO MainThread:1825920 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/logs/debug-internal.log +2026-02-01 22:29:49,759 INFO MainThread:1825920 [wandb_init.py:init():844] calling init triggers +2026-02-01 22:29:49,760 INFO MainThread:1825920 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 105, 
'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 5, 'strategy': 'fsdp', 'ppo_mini_batch_size': 512, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 64, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': 
[], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 5, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 64, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': 
{'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 512, 'response_length': 1024, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.7, 'ignore_eos': False, 'enforce_eager': False, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 64, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 5, 'over_sample_rate': 0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 
256, 'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': False, 'agent': {'_target_': 'verl.workers.config.AgentLoopConfig', 'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': 
False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': False}, 'model': {'_target_': 'verl.workers.config.HFModelConfig', 'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/data/gsm8k/train.parquet', 'val_files': '/home/mshahidul/data/gsm8k/test.parquet', 'train_max_samples': -1, 'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 512, 'max_response_length': 1024, 'train_batch_size': 1024, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 
'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': None}, 'datagen': {'path': None, 'name': None}, 'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 105, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 5, 'strategy': 
'fsdp', 'enable': None, 'ppo_mini_batch_size': 512, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 
'forward_max_token_len_per_gpu': 32768, 'reward_manager': 'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 'reward_loop_class_name': None, 'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 
'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 'target_kl': 0.1}, 'use_pf_ppo': False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-optimized-multiclinsum-gs', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 20, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 5, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': 'checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs', 'max_actor_ckpt_to_keep': None, 'max_critic_ckpt_to_keep': None, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto'}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 
'timeline_json_file': None}, '_wandb': {}} +2026-02-01 22:29:49,760 INFO MainThread:1825920 [wandb_init.py:init():892] starting backend +2026-02-01 22:29:50,833 INFO MainThread:1825920 [wandb_init.py:init():895] sending inform_init request +2026-02-01 22:29:50,869 INFO MainThread:1825920 [wandb_init.py:init():903] backend started and connected +2026-02-01 22:29:50,881 INFO MainThread:1825920 [wandb_init.py:init():973] updated telemetry +2026-02-01 22:29:50,915 INFO MainThread:1825920 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-01 22:29:52,181 INFO MainThread:1825920 [wandb_init.py:init():1042] starting run threads in backend +2026-02-01 22:29:53,656 INFO MainThread:1825920 [wandb_run.py:_console_start():2529] atexit reg +2026-02-01 22:29:53,656 INFO MainThread:1825920 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-01 22:29:53,656 INFO MainThread:1825920 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-01 22:29:53,656 INFO MainThread:1825920 [wandb_run.py:_redirect():2469] Redirects installed. 
+2026-02-01 22:29:53,696 INFO MainThread:1825920 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-01 22:37:14,333 INFO MainThread:1825920 [wandb_run.py:_finish():2295] finishing run shahidulshakib034-khulna-university-of-engineering-techn/readctrl-verl/yk5vgzhp +2026-02-01 22:37:14,338 INFO MainThread:1825920 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0 +2026-02-01 22:37:14,339 INFO MainThread:1825920 [wandb_run.py:_restore():2476] restore +2026-02-01 22:37:14,340 INFO MainThread:1825920 [wandb_run.py:_restore():2482] restore done +2026-02-01 22:37:14,970 INFO MainThread:1825920 [wandb_run.py:_footer_sync_info():3871] logging synced files diff --git a/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..5b051dbdd35af029330a4e58d1d24cd173c74c81 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/files/output.log @@ -0,0 +1,23 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} +validation generation end +("Initial validation metrics: {'val-aux/openai/gsm8k/reward/mean@1': " + "np.float64(0.0), 'val-core/openai/gsm8k/acc/mean@1': np.float64(0.0), " + "'val-aux/num_turns/min': np.int32(2), 'val-aux/num_turns/max': np.int32(2), " + "'val-aux/num_turns/mean': np.float64(2.0)}") +step:0 - val-aux/openai/gsm8k/reward/mean@1:np.float64(0.0) - val-core/openai/gsm8k/acc/mean@1:np.float64(0.0) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) +Training Progress: 4%|▍ | 8/210 [43:19<17:42:11, 315.50s/it] +step:1 - global_seqlen/min:318997 - global_seqlen/max:329643 - global_seqlen/minmax_diff:10646 - global_seqlen/balanced_min:324320 - global_seqlen/balanced_max:324320 - global_seqlen/mean:324320.0 - actor/entropy:0.12505832314491272 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0) - actor/kl_loss:np.float64(0.0) - actor/pg_clipfrac:np.float64(0.0) - actor/ppo_kl:np.float64(0.0) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.0) - perf/mfu/actor:np.float64(0.19641268732332814) - perf/max_memory_allocated_gb:np.float64(57.626702308654785) - perf/max_memory_reserved_gb:np.float64(72.806640625) - perf/cpu_memory_used_gb:np.float64(628.5751190185547) - actor/lr:np.float64(1e-06) - training/global_step:1 - training/epoch:0 - critic/score/mean:0.0 - critic/score/max:0.0 - critic/score/min:0.0 - critic/rewards/mean:0.0 - critic/rewards/max:0.0 - 
critic/rewards/min:0.0 - critic/advantages/mean:0.0 - critic/advantages/max:0.0 - critic/advantages/min:0.0 - critic/returns/mean:0.0 - critic/returns/max:0.0 - critic/returns/min:0.0 - response_length/mean:252.7369842529297 - response_length/max:768.0 - response_length/min:72.0 - response_length/clip_ratio:0.02604166604578495 - response_length_non_aborted/mean:252.7369842529297 - response_length_non_aborted/max:768.0 - response_length_non_aborted/min:72.0 - response_length_non_aborted/clip_ratio:0.02604166604578495 - response/aborted_ratio:0.0 - prompt_length/mean:169.5546875 - prompt_length/max:286.0 - prompt_length/min:129.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.0006777718663215637 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(8.79530194029212) - timing_s/agent_loop/generate_sequences/max:np.float64(33.35468749515712) - timing_s/agent_loop/generate_sequences/mean:np.float64(19.750638691024506) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(33.35468749515712) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:166 - timing_s/agent_loop/slowest/response_length:768 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:37.631233430467546 - timing_s/reward:0.0002766614779829979 - timing_s/old_log_prob:53.294130026362836 - timing_s/ref:89.15221855975688 - timing_s/adv:0.07310293056070805 - timing_s/update_actor:142.75187538284808 - timing_s/update_weights:52.132928838953376 - timing_s/step:375.7928413292393 - timing_s/stop_profile:0.00024086330085992813 - 
timing_per_token_ms/ref:0.13744483621077466 - timing_per_token_ms/gen:0.09693674828303558 - timing_per_token_ms/adv:0.00011270185397247787 - timing_per_token_ms/update_actor:0.220078742265121 - perf/total_num_tokens:648640 - perf/time_per_step:375.7928413292393 - perf/throughput:863.0286805167133 +step:2 - global_seqlen/min:323793 - global_seqlen/max:329749 - global_seqlen/minmax_diff:5956 - global_seqlen/balanced_min:326771 - global_seqlen/balanced_max:326771 - global_seqlen/mean:326771.0 - actor/entropy:0.13981616497039795 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0) - actor/kl_loss:np.float64(0.0) - actor/pg_clipfrac:np.float64(0.0) - actor/ppo_kl:np.float64(0.0) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.0) - perf/mfu/actor:np.float64(0.2168887847103933) - perf/max_memory_allocated_gb:np.float64(65.87330293655396) - perf/max_memory_reserved_gb:np.float64(73.822265625) - perf/cpu_memory_used_gb:np.float64(629.0960922241211) - actor/lr:np.float64(1e-06) - training/global_step:2 - training/epoch:0 - critic/score/mean:0.0 - critic/score/max:0.0 - critic/score/min:0.0 - critic/rewards/mean:0.0 - critic/rewards/max:0.0 - critic/rewards/min:0.0 - critic/advantages/mean:0.0 - critic/advantages/max:0.0 - critic/advantages/min:0.0 - critic/returns/mean:0.0 - critic/returns/max:0.0 - critic/returns/min:0.0 - response_length/mean:257.1334533691406 - response_length/max:768.0 - response_length/min:80.0 - response_length/clip_ratio:0.0384114570915699 - response_length_non_aborted/mean:257.1334533691406 - response_length_non_aborted/max:768.0 - response_length_non_aborted/min:80.0 - response_length_non_aborted/clip_ratio:0.0384114570915699 - response/aborted_ratio:0.0 - prompt_length/mean:168.349609375 - prompt_length/max:279.0 - prompt_length/min:132.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - 
timing_s/start_profile:6.119813770055771e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(9.270651680417359) - timing_s/agent_loop/generate_sequences/max:np.float64(35.99071122240275) - timing_s/agent_loop/generate_sequences/mean:np.float64(20.42490219332103) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(35.99071122240275) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:142 - timing_s/agent_loop/slowest/response_length:768 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:40.38632332812995 - timing_s/reward:0.0001812586560845375 - timing_s/old_log_prob:48.146111900918186 - timing_s/ref:48.91424604319036 - timing_s/adv:0.09017631784081459 - timing_s/update_actor:130.3168063648045 - timing_s/update_weights:55.40812928136438 - timing_s/step:324.01379853580147 - timing_s/stop_profile:0.0001643802970647812 - timing_per_token_ms/ref:0.0748448394184159 - timing_per_token_ms/gen:0.10225498808257595 - timing_per_token_ms/adv:0.0001379809068748674 - timing_per_token_ms/update_actor:0.19940081335982157 - perf/total_num_tokens:653542 - perf/time_per_step:324.01379853580147 - perf/throughput:1008.5095186583354 +step:3 - global_seqlen/min:308578 - global_seqlen/max:314939 - global_seqlen/minmax_diff:6361 - global_seqlen/balanced_min:311758 - global_seqlen/balanced_max:311759 - global_seqlen/mean:311758.5 - actor/entropy:0.12481798976659775 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0) - actor/kl_loss:np.float64(0.0) - actor/pg_clipfrac:np.float64(0.0) - actor/ppo_kl:np.float64(0.0) - actor/pg_clipfrac_lower:np.float64(0.0) - 
actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.0) - perf/mfu/actor:np.float64(0.21467723995353935) - perf/max_memory_allocated_gb:np.float64(65.87330293655396) - perf/max_memory_reserved_gb:np.float64(73.822265625) - perf/cpu_memory_used_gb:np.float64(647.254243850708) - actor/lr:np.float64(1e-06) - training/global_step:3 - training/epoch:0 - critic/score/mean:0.0 - critic/score/max:0.0 - critic/score/min:0.0 - critic/rewards/mean:0.0 - critic/rewards/max:0.0 - critic/rewards/min:0.0 - critic/advantages/mean:0.0 - critic/advantages/max:0.0 - critic/advantages/min:0.0 - critic/returns/mean:0.0 - critic/returns/max:0.0 - critic/returns/min:0.0 - response_length/mean:238.791015625 - response_length/max:768.0 - response_length/min:64.0 - response_length/clip_ratio:0.021484375 - response_length_non_aborted/mean:238.791015625 - response_length_non_aborted/max:768.0 - response_length_non_aborted/min:64.0 - response_length_non_aborted/clip_ratio:0.021484375 - response/aborted_ratio:0.0 - prompt_length/mean:167.14453125 - prompt_length/max:320.0 - prompt_length/min:130.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00010171346366405487 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(8.569516553543508) - timing_s/agent_loop/generate_sequences/max:np.float64(32.7550454903394) - timing_s/agent_loop/generate_sequences/mean:np.float64(19.676890378410462) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(32.7550454903394) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:150 - 
timing_s/agent_loop/slowest/response_length:768 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:38.46861767116934 - timing_s/reward:0.00014963746070861816 - timing_s/old_log_prob:45.267706642858684 - timing_s/ref:44.06274450849742 - timing_s/adv:0.0684628626331687 - timing_s/update_actor:125.43432772811502 - timing_s/update_weights:53.64646195154637 - timing_s/step:307.7038074657321 - timing_s/stop_profile:0.00017327163368463516 - timing_per_token_ms/ref:0.07066807241582414 - timing_per_token_ms/gen:0.10488113590643335 - timing_per_token_ms/adv:0.0001098011163018309 - timing_per_token_ms/update_actor:0.20117226591755322 - perf/total_num_tokens:623517 - perf/time_per_step:307.7038074657321 - perf/throughput:1013.1772582460471 +step:4 - global_seqlen/min:322720 - global_seqlen/max:323987 - global_seqlen/minmax_diff:1267 - global_seqlen/balanced_min:323353 - global_seqlen/balanced_max:323354 - global_seqlen/mean:323353.5 - actor/entropy:0.12859423458576202 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0) - actor/kl_loss:np.float64(0.0) - actor/pg_clipfrac:np.float64(0.0) - actor/ppo_kl:np.float64(0.0) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.0) - perf/mfu/actor:np.float64(0.2146770662519158) - perf/max_memory_allocated_gb:np.float64(65.87330293655396) - perf/max_memory_reserved_gb:np.float64(73.822265625) - perf/cpu_memory_used_gb:np.float64(633.0780906677246) - actor/lr:np.float64(1e-06) - training/global_step:4 - training/epoch:0 - critic/score/mean:0.0 - critic/score/max:0.0 - critic/score/min:0.0 - critic/rewards/mean:0.0 - critic/rewards/max:0.0 - critic/rewards/min:0.0 - critic/advantages/mean:0.0 - critic/advantages/max:0.0 - critic/advantages/min:0.0 - critic/returns/mean:0.0 - critic/returns/max:0.0 - critic/returns/min:0.0 - response_length/mean:252.455078125 - response_length/max:768.0 - response_length/min:88.0 - response_length/clip_ratio:0.03125 - 
response_length_non_aborted/mean:252.455078125 - response_length_non_aborted/max:768.0 - response_length_non_aborted/min:88.0 - response_length_non_aborted/clip_ratio:0.03125 - response/aborted_ratio:0.0 - prompt_length/mean:168.578125 - prompt_length/max:275.0 - prompt_length/min:127.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:5.78882172703743e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(10.685429918579757) - timing_s/agent_loop/generate_sequences/max:np.float64(34.60890247207135) - timing_s/agent_loop/generate_sequences/mean:np.float64(20.38838475912659) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(34.60890247207135) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:154 - timing_s/agent_loop/slowest/response_length:768 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:38.307684062980115 - timing_s/reward:0.0007219910621643066 - timing_s/old_log_prob:50.92320989817381 - timing_s/ref:45.26833815127611 - timing_s/adv:0.1157098663970828 - timing_s/update_actor:130.2316440390423 - timing_s/update_weights:55.4500416405499 - timing_s/step:321.1360242785886 - timing_s/stop_profile:0.00031584594398736954 - timing_per_token_ms/ref:0.06999821890172228 - timing_per_token_ms/gen:0.09878945063705155 - timing_per_token_ms/adv:0.00017892162354371115 - timing_per_token_ms/update_actor:0.20137658018088916 - perf/total_num_tokens:646707 - perf/time_per_step:321.1360242785886 - perf/throughput:1006.9050980075898 +test_gen_batch meta info: {'eos_token_id': 151645, 
'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 5} +validation generation end +step:5 - global_seqlen/min:311041 - global_seqlen/max:317705 - global_seqlen/minmax_diff:6664 - global_seqlen/balanced_min:314373 - global_seqlen/balanced_max:314373 - global_seqlen/mean:314373.0 - actor/entropy:0.11806871742010117 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0) - actor/kl_loss:np.float64(0.0) - actor/pg_clipfrac:np.float64(0.0) - actor/ppo_kl:np.float64(0.0) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.0) - perf/mfu/actor:np.float64(0.21450295285391346) - perf/max_memory_allocated_gb:np.float64(65.87330293655396) - perf/max_memory_reserved_gb:np.float64(73.822265625) - perf/cpu_memory_used_gb:np.float64(633.0920104980469) - actor/lr:np.float64(1e-06) - val-aux/openai/gsm8k/reward/mean@1:np.float64(0.0) - val-core/openai/gsm8k/acc/mean@1:np.float64(0.0) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) - training/global_step:5 - training/epoch:0 - critic/score/mean:0.0 - critic/score/max:0.0 - critic/score/min:0.0 - critic/rewards/mean:0.0 - critic/rewards/max:0.0 - critic/rewards/min:0.0 - critic/advantages/mean:0.0 - critic/advantages/max:0.0 - critic/advantages/min:0.0 - critic/returns/mean:0.0 - critic/returns/max:0.0 - critic/returns/min:0.0 - response_length/mean:240.931640625 - response_length/max:768.0 - response_length/min:87.0 - response_length/clip_ratio:0.01888020895421505 - response_length_non_aborted/mean:240.931640625 - response_length_non_aborted/max:768.0 - response_length_non_aborted/min:87.0 - response_length_non_aborted/clip_ratio:0.01888020895421505 - response/aborted_ratio:0.0 - prompt_length/mean:168.408203125 - prompt_length/max:260.0 - prompt_length/min:130.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - 
num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00022962503135204315 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(9.077454973012209) - timing_s/agent_loop/generate_sequences/max:np.float64(32.85791217163205) - timing_s/agent_loop/generate_sequences/mean:np.float64(18.92793111639791) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(32.85791217163205) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:149 - timing_s/agent_loop/slowest/response_length:768 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:37.798188225366175 - timing_s/reward:0.00038691237568855286 - timing_s/old_log_prob:44.98774911649525 - timing_s/ref:46.94722242560238 - timing_s/adv:0.08573572803288698 - timing_s/update_actor:126.64321015495807 - timing_s/update_weights:52.50617145188153 - timing_s/step:309.7617739085108 - timing_s/testing:32.15901133790612 - timing_s/stop_profile:7.286109030246735e-05 - timing_per_token_ms/ref:0.0746680256027114 - timing_per_token_ms/gen:0.10213766608398436 - timing_per_token_ms/adv:0.00013635987828612345 - timing_per_token_ms/update_actor:0.2014218939841495 - perf/total_num_tokens:628746 - perf/time_per_step:309.7617739085108 - perf/throughput:1014.8863626176519 +step:6 - global_seqlen/min:309804 - global_seqlen/max:315725 - global_seqlen/minmax_diff:5921 - global_seqlen/balanced_min:312764 - global_seqlen/balanced_max:312765 - global_seqlen/mean:312764.5 - actor/entropy:0.11925524473190308 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0) - actor/kl_loss:np.float64(0.0) - actor/pg_clipfrac:np.float64(0.0) - actor/ppo_kl:np.float64(0.0) 
- actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.0) - perf/mfu/actor:np.float64(0.21461234898885326) - perf/max_memory_allocated_gb:np.float64(65.87330293655396) - perf/max_memory_reserved_gb:np.float64(73.822265625) - perf/cpu_memory_used_gb:np.float64(632.5024299621582) - actor/lr:np.float64(1e-06) - training/global_step:6 - training/epoch:0 - critic/score/mean:0.0 - critic/score/max:0.0 - critic/score/min:0.0 - critic/rewards/mean:0.0 - critic/rewards/max:0.0 - critic/rewards/min:0.0 - critic/advantages/mean:0.0 - critic/advantages/max:0.0 - critic/advantages/min:0.0 - critic/returns/mean:0.0 - critic/returns/max:0.0 - critic/returns/min:0.0 - response_length/mean:240.6751251220703 - response_length/max:768.0 - response_length/min:79.0 - response_length/clip_ratio:0.02669270895421505 - response_length_non_aborted/mean:240.6751251220703 - response_length_non_aborted/max:768.0 - response_length_non_aborted/min:79.0 - response_length_non_aborted/clip_ratio:0.02669270895421505 - response/aborted_ratio:0.0 - prompt_length/mean:166.5703125 - prompt_length/max:270.0 - prompt_length/min:128.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:7.586739957332611e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(8.93101360462606) - timing_s/agent_loop/generate_sequences/max:np.float64(31.999577826820314) - timing_s/agent_loop/generate_sequences/mean:np.float64(18.698887176243563) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(31.999577826820314) - 
timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:142 - timing_s/agent_loop/slowest/response_length:768 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:36.813020741567016 - timing_s/reward:0.0002562897279858589 - timing_s/old_log_prob:45.04324695467949 - timing_s/ref:47.843891732394695 - timing_s/adv:0.09176636021584272 - timing_s/update_actor:125.96022703871131 - timing_s/update_weights:54.325182356871665 - timing_s/step:310.87344992533326 - timing_s/stop_profile:0.00011400599032640457 - timing_per_token_ms/ref:0.07648548945355801 - timing_per_token_ms/gen:0.0995815826831721 - timing_per_token_ms/adv:0.00014670200776597525 - timing_per_token_ms/update_actor:0.20136592714120577 - perf/total_num_tokens:625529 - perf/time_per_step:310.87344992533326 - perf/throughput:1006.0830221272383 +step:7 - global_seqlen/min:315345 - global_seqlen/max:323795 - global_seqlen/minmax_diff:8450 - global_seqlen/balanced_min:319570 - global_seqlen/balanced_max:319570 - global_seqlen/mean:319570.0 - actor/entropy:0.12091823667287827 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0) - actor/kl_loss:np.float64(0.0) - actor/pg_clipfrac:np.float64(0.0) - actor/ppo_kl:np.float64(0.0) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.0) - perf/mfu/actor:np.float64(0.21381616696931857) - perf/max_memory_allocated_gb:np.float64(65.87330293655396) - perf/max_memory_reserved_gb:np.float64(73.822265625) - perf/cpu_memory_used_gb:np.float64(632.5226821899414) - actor/lr:np.float64(1e-06) - training/global_step:7 - training/epoch:0 - critic/score/mean:0.0 - critic/score/max:0.0 - critic/score/min:0.0 - critic/rewards/mean:0.0 - critic/rewards/max:0.0 - critic/rewards/min:0.0 - critic/advantages/mean:0.0 - critic/advantages/max:0.0 - critic/advantages/min:0.0 - critic/returns/mean:0.0 - critic/returns/max:0.0 - critic/returns/min:0.0 - 
response_length/mean:248.2200469970703 - response_length/max:768.0 - response_length/min:80.0 - response_length/clip_ratio:0.02408854104578495 - response_length_non_aborted/mean:248.2200469970703 - response_length_non_aborted/max:768.0 - response_length_non_aborted/min:80.0 - response_length_non_aborted/clip_ratio:0.02408854104578495 - response/aborted_ratio:0.0 - prompt_length/mean:167.88671875 - prompt_length/max:258.0 - prompt_length/min:133.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:7.216818630695343e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(9.902485113590956) - timing_s/agent_loop/generate_sequences/max:np.float64(33.80569928046316) - timing_s/agent_loop/generate_sequences/mean:np.float64(20.285159144414138) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(33.80569928046316) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:154 - timing_s/agent_loop/slowest/response_length:768 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:38.38099627196789 - timing_s/reward:0.00031245313584804535 - timing_s/old_log_prob:42.91629869863391 - timing_s/ref:44.90813964419067 - timing_s/adv:0.09593600407242775 - timing_s/update_actor:129.198257451877 - timing_s/update_weights:56.63132025767118 - timing_s/step:312.9049155814573 - timing_s/stop_profile:0.00022598076611757278 - timing_per_token_ms/ref:0.07026338461712718 - timing_per_token_ms/gen:0.10066724090783832 - timing_per_token_ms/adv:0.00015010170552997426 - 
timing_per_token_ms/update_actor:0.20214390814512784 - perf/total_num_tokens:639140 - perf/time_per_step:312.9049155814573 - perf/throughput:1021.3006702248741 +step:8 - global_seqlen/min:310305 - global_seqlen/max:319516 - global_seqlen/minmax_diff:9211 - global_seqlen/balanced_min:314910 - global_seqlen/balanced_max:314911 - global_seqlen/mean:314910.5 - actor/entropy:0.12208293378353119 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0) - actor/kl_loss:np.float64(0.0) - actor/pg_clipfrac:np.float64(0.0) - actor/ppo_kl:np.float64(0.0) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.0) - perf/mfu/actor:np.float64(0.21311499405001913) - perf/max_memory_allocated_gb:np.float64(65.87330293655396) - perf/max_memory_reserved_gb:np.float64(73.822265625) - perf/cpu_memory_used_gb:np.float64(648.1669216156006) - actor/lr:np.float64(1e-06) - training/global_step:8 - training/epoch:0 - critic/score/mean:0.0 - critic/score/max:0.0 - critic/score/min:0.0 - critic/rewards/mean:0.0 - critic/rewards/max:0.0 - critic/rewards/min:0.0 - critic/advantages/mean:0.0 - critic/advantages/max:0.0 - critic/advantages/min:0.0 - critic/returns/mean:0.0 - critic/returns/max:0.0 - critic/returns/min:0.0 - response_length/mean:242.4830780029297 - response_length/max:768.0 - response_length/min:78.0 - response_length/clip_ratio:0.0234375 - response_length_non_aborted/mean:242.4830780029297 - response_length_non_aborted/max:768.0 - response_length_non_aborted/min:78.0 - response_length_non_aborted/clip_ratio:0.0234375 - response/aborted_ratio:0.0 - prompt_length/mean:167.556640625 - prompt_length/max:259.0 - prompt_length/min:129.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:6.377976387739182e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - 
timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(9.95536563359201) - timing_s/agent_loop/generate_sequences/max:np.float64(33.838891653344035) - timing_s/agent_loop/generate_sequences/mean:np.float64(20.62224173213023) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(33.838891653344035) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:150 - timing_s/agent_loop/slowest/response_length:768 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:36.98162783496082 - timing_s/reward:0.0002691242843866348 - timing_s/old_log_prob:45.680936123244464 - timing_s/ref:45.94916301127523 - timing_s/adv:0.09313492756336927 - timing_s/update_actor:127.74078119918704 - timing_s/update_weights:47.056643958203495 - timing_s/step:304.28040032088757 - timing_s/stop_profile:0.0002472829073667526 - timing_per_token_ms/ref:0.0729559081251264 - timing_per_token_ms/gen:0.09929179934961316 - timing_per_token_ms/adv:0.0001478752336987323 - timing_per_token_ms/update_actor:0.20282077161477155 - perf/total_num_tokens:629821 - perf/time_per_step:304.28040032088757 - perf/throughput:1034.9352099836274 diff --git a/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/files/requirements.txt b/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..58b4ec54fa2f6ed38448fc9311264354380a5821 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/files/requirements.txt @@ -0,0 +1,268 @@ +verl==0.8.0.dev0 +psutil==7.1.3 +colorama==0.4.6 +annotated-doc==0.0.4 +sentry-sdk==2.51.0 +requests==2.32.5 +nvidia-cufile-cu12==1.13.1.3 +ml_dtypes==0.5.4 +xformers==0.0.32.post1 
+sglang==0.5.2 +multidict==6.7.1 +typing_extensions==4.15.0 +nvidia-cusparselt-cu12==0.7.1 +openai-harmony==0.0.4 +transformers==4.56.1 +Werkzeug==3.1.5 +identify==2.6.16 +pytest==9.0.2 +nvidia-cuda-runtime-cu12==12.8.90 +GitPython==3.1.46 +cupy-cuda12x==13.6.0 +tokenizers==0.22.2 +pybind11==3.0.1 +google-api-core==2.29.0 +partial-json-parser==0.2.1.1.post7 +aiohttp-cors==0.8.1 +sniffio==1.3.1 +tensordict==0.10.0 +smart_open==7.5.0 +cffi==2.0.0 +wcwidth==0.5.3 +asttokens==3.0.1 +opencensus==0.11.4 +rpds-py==0.30.0 +py-spy==0.4.1 +gguf==0.17.1 +nvidia-nvjitlink-cu12==12.8.93 +httpx==0.28.1 +cuda-python==13.1.1 +annotated-types==0.7.0 +regex==2026.1.15 +vllm==0.11.0 +idna==3.11 +fsspec==2025.10.0 +parso==0.8.5 +pydantic-extra-types==2.11.0 +MarkupSafe==3.0.3 +cryptography==46.0.4 +filelock==3.20.3 +modelscope==1.34.0 +outlines==0.1.11 +dnspython==2.8.0 +scipy==1.17.0 +zipp==3.23.0 +PyYAML==6.0.3 +onnx==1.20.1 +torchdata==0.11.0 +cuda-pathfinder==1.3.3 +verl==0.8.0.dev0 +httptools==0.7.1 +opencv-python-headless==4.13.0.90 +importlib_metadata==8.7.1 +peft==0.18.1 +opentelemetry-sdk==1.39.1 +python-json-logger==4.0.0 +cuda-bindings==13.1.1 +mdurl==0.1.2 +referencing==0.37.0 +xxhash==3.6.0 +interegular==0.3.3 +fastapi-cli==0.0.20 +uv==0.9.28 +tensorboard==2.20.0 +nvidia-cublas-cu12==12.8.4.1 +sentencepiece==0.2.1 +rich-toolkit==0.18.1 +numpy==2.2.0 +yarl==1.22.0 +opencv-fixer==0.2.5 +tqdm==4.67.2 +python-dotenv==1.2.1 +timm==1.0.16 +aiohappyeyeballs==2.6.1 +decord==0.6.0 +jiter==0.12.0 +airportsdata==20250909 +markdown-it-py==4.0.0 +nvidia-cusolver-cu12==11.7.3.90 +pyarrow==23.0.0 +opentelemetry-proto==1.39.1 +anyio==4.12.1 +pycryptodomex==3.23.0 +prometheus_client==0.24.1 +aiohttp==3.13.3 +urllib3==2.6.3 +pexpect==4.9.0 +pydantic-settings==2.12.0 +distro==1.9.0 +av==16.1.0 +cloudpickle==3.1.2 +mpmath==1.3.0 +certifi==2026.1.4 +antlr4-python3-runtime==4.9.3 +torchvision==0.23.0 +accelerate==1.12.0 +watchfiles==1.1.1 +ruff==0.14.14 +wheel==0.46.3 +omegaconf==2.3.0 
+nvidia-cufft-cu12==11.3.3.83 +multiprocess==0.70.18 +frozendict==2.4.7 +sympy==1.14.0 +setproctitle==1.3.7 +setuptools==79.0.1 +py-cpuinfo==9.0.0 +ipython_pygments_lexers==1.1.1 +rich==14.3.2 +uvicorn==0.40.0 +openai==1.99.1 +outlines_core==0.2.11 +llvmlite==0.44.0 +nvidia-cuda-cupti-cu12==12.8.90 +attrs==25.4.0 +anthropic==0.77.0 +packaging==25.0 +fastrlock==0.8.3 +astor==0.8.1 +pluggy==1.6.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +psutil==7.2.2 +virtualenv==20.36.1 +cbor2==5.8.0 +compressed-tensors==0.11.0 +nvidia-cusparse-cu12==12.5.8.93 +networkx==3.6.1 +httpcore==1.0.9 +onnxscript==0.3.1 +smmap==5.0.2 +opencv-python==4.13.0.90 +traitlets==5.14.3 +python-multipart==0.0.22 +pyvers==0.1.0 +huggingface-hub==0.36.0 +pillow==12.1.0 +jsonschema==4.26.0 +cfgv==3.5.0 +optree==0.18.0 +email-validator==2.3.0 +tabulate==0.9.0 +pre_commit==4.5.1 +msgpack==1.1.2 +depyf==0.19.0 +numba==0.61.2 +six==1.17.0 +aiosignal==1.4.0 +nvidia-nvtx-cu12==12.8.90 +propcache==0.4.1 +torch_memory_saver==0.0.8 +h11==0.16.0 +frozenlist==1.8.0 +websockets==16.0 +nvidia-cudnn-frontend==1.18.0 +build==1.4.0 +google-auth==2.48.0 +pycountry==24.6.1 +stack-data==0.6.3 +typing-inspection==0.4.2 +googleapis-common-protos==1.72.0 +pandas==3.0.0 +typer==0.21.1 +protobuf==6.33.5 +fastapi==0.128.0 +blake3==1.0.8 +opentelemetry-semantic-conventions==0.60b1 +opentelemetry-exporter-prometheus==0.60b1 +nvidia-cudnn-cu12==9.10.2.21 +Markdown==3.10.1 +liger_kernel==0.6.4 +nodeenv==1.10.0 +prompt_toolkit==3.0.52 +torchaudio==2.8.0 +codetiming==1.4.0 +platformdirs==4.5.1 +jsonschema-specifications==2025.9.1 +hydra-core==1.3.2 +tensorboard-data-server==0.7.2 +lm-format-enforcer==0.11.3 +pyasn1_modules==0.4.2 +tiktoken==0.12.0 +starlette==0.50.0 +pyproject_hooks==1.2.0 +flash_attn==2.8.1 +rsa==4.9.1 +ray==2.53.0 +nest-asyncio==1.6.0 +lark==1.2.2 +fastar==0.8.0 +orjson==3.11.6 +prometheus-fastapi-instrumentator==7.1.0 +opentelemetry-api==1.39.1 +mathruler==0.1.0 +pydantic_core==2.41.5 +fastapi-cloud-cli==0.11.0 
+pynvml==13.0.1 +loguru==0.7.3 +torch==2.8.0 +msgspec==0.20.0 +nvidia-curand-cu12==10.3.9.90 +blobfile==3.0.0 +gitdb==4.0.12 +llguidance==0.7.30 +hf_transfer==0.1.9 +nvidia-nccl-cu12==2.27.3 +qwen-vl-utils==0.0.14 +ptyprocess==0.7.0 +opencensus-context==0.1.3 +jedi==0.19.2 +click==8.3.1 +datasets==4.5.0 +soxr==1.0.0 +sgl-kernel==0.3.9.post2 +colorful==0.5.8 +pyasn1==0.6.2 +charset-normalizer==3.4.4 +nvidia-ml-py==13.590.48 +hf-xet==1.2.0 +dill==0.4.0 +absl-py==2.4.0 +pydantic==2.12.5 +wrapt==2.1.0 +flashinfer-python==0.3.1 +python-dateutil==2.9.0.post0 +torchao==0.9.0 +cachetools==7.0.0 +soundfile==0.13.1 +diskcache==5.6.3 +onnx-ir==0.1.15 +docstring_parser==0.17.0 +matplotlib-inline==0.2.1 +Pygments==2.19.2 +wandb==0.24.1 +pure_eval==0.2.3 +ninja==1.13.0 +proto-plus==1.27.0 +pyzmq==27.1.0 +iniconfig==2.3.0 +Jinja2==3.1.6 +megatron-core==0.13.1 +uvloop==0.22.1 +pycparser==3.0 +pylatexenc==2.10 +decorator==5.2.1 +shellingham==1.5.4 +lxml==6.0.2 +safetensors==0.7.0 +xgrammar==0.1.25 +pybase64==1.4.3 +ipython==9.9.0 +mistral_common==1.9.0 +rignore==0.7.6 +einops==0.8.2 +distlib==0.4.0 +triton==3.4.0 +executing==2.2.1 +grpcio==1.76.0 +pip==25.3 +verl==0.8.0.dev0 diff --git a/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/files/wandb-metadata.json b/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..52a1e05c23de6496ac88903251f6c5a5315292f8 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/files/wandb-metadata.json @@ -0,0 +1,93 @@ +{ + "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.12", + "startedAt": "2026-02-02T04:27:45.300361Z", + "args": [ + "--node-ip-address=172.16.34.29", + "--node-manager-port=36763", + "--object-store-name=/tmp/ray/session_2026-02-01_23-22-21_164910_1930058/sockets/plasma_store", + 
"--raylet-name=/tmp/ray/session_2026-02-01_23-22-21_164910_1930058/sockets/raylet", + "--redis-address=None", + "--metrics-agent-port=60959", + "--logging-rotate-bytes=536870912", + "--logging-rotate-backup-count=5", + "--runtime-env-agent-port=45868", + "--gcs-address=172.16.34.29:62452", + "--session-name=session_2026-02-01_23-22-21_164910_1930058", + "--temp-dir=/tmp/ray", + "--webui=127.0.0.1:8301", + "--cluster-id=ac57a802555dae4165dd7d24b2e2c764ea830f18e342964154e6c03f", + "--startup-token=128", + "--worker-launch-time-ms=1770006154733", + "--node-id=bd4e10b69d81888d9a1a4f12d6068c902ba8c1748a63a91825eea4a3", + "--runtime-env-hash=1096984665" + ], + "program": "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py", + "git": { + "remote": "https://github.com/verl-project/verl", + "commit": "d9939add7a2a01923a9088891f913a5d20c4e622" + }, + "email": "shahidulshakib034@gmail.com", + "root": "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train", + "host": "gamma", + "executable": "/home/mshahidul/miniconda3/envs/verl2/bin/python3", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA A100 80GB PCIe", + "gpu_count": 6, + "disk": { + "/": { + "total": "3766429188096", + "used": "182712745984" + } + }, + "memory": { + "total": "1081814863872" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + 
"architecture": "Ampere", + "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece" + } + ], + "cudaVersion": "13.0", + "writerId": "jwnri0taq2i5mkgbllgxk7pjo2rf6mcs" +} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/logs/debug-core.log b/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..e12a6b744817e2451003e468dc854f542045b298 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2026-02-01T23:27:45.409937779-05:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmph8f0s0rx/port-1939239.txt","pid":1939239,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-01T23:27:45.410602373-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":1939239} +{"time":"2026-02-01T23:27:45.410558497-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-1939239-1948818-3315300227/socket","Net":"unix"}} +{"time":"2026-02-01T23:27:45.5871832-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-01T23:27:45.601065215-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"x2j8bpwi","id":"1(@)"} +{"time":"2026-02-01T23:27:46.432473609-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"x2j8bpwi","id":"1(@)"} 
+{"time":"2026-02-01T23:27:53.037487194-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"y0tietc7voqe"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/logs/debug-internal.log b/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..9fd029d1d170d81d6f853dee0f4b6d105ac03d36 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2026-02-01T23:27:45.602522551-05:00","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-01T23:27:46.428568973-05:00","level":"INFO","msg":"stream: created new stream","id":"x2j8bpwi"} +{"time":"2026-02-01T23:27:46.428735109-05:00","level":"INFO","msg":"handler: started","stream_id":"x2j8bpwi"} +{"time":"2026-02-01T23:27:46.432445949-05:00","level":"INFO","msg":"stream: started","id":"x2j8bpwi"} +{"time":"2026-02-01T23:27:46.432507095-05:00","level":"INFO","msg":"writer: started","stream_id":"x2j8bpwi"} +{"time":"2026-02-01T23:27:46.43251517-05:00","level":"INFO","msg":"sender: started","stream_id":"x2j8bpwi"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..66455e5ce4f83755fa2a9cb4b1e4681deb2579b9 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/logs/debug.log @@ -0,0 +1,19 @@ +2026-02-01 23:27:45,317 INFO MainThread:1939239 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-01 23:27:45,317 INFO MainThread:1939239 [wandb_setup.py:_flush():81] Configure stats pid to 1939239 +2026-02-01 23:27:45,317 INFO MainThread:1939239 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-01 23:27:45,317 INFO 
MainThread:1939239 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/logs/debug.log +2026-02-01 23:27:45,318 INFO MainThread:1939239 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/logs/debug-internal.log +2026-02-01 23:27:45,318 INFO MainThread:1939239 [wandb_init.py:init():844] calling init triggers +2026-02-01 23:27:45,319 INFO MainThread:1939239 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 210, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 32, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 
1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 
'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 512, 'response_length': 768, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.6, 'ignore_eos': False, 'enforce_eager': False, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 
1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': False, 'agent': {'_target_': 'verl.workers.config.AgentLoopConfig', 'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 
'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': False}, 'model': {'_target_': 'verl.workers.config.HFModelConfig', 'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 
'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/data/gsm8k/train.parquet', 'val_files': '/home/mshahidul/data/gsm8k/test.parquet', 'train_max_samples': -1, 'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 512, 'max_response_length': 768, 'train_batch_size': 512, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': None}, 'datagen': {'path': None, 'name': None}, 'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 210, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 
'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 
'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 'reward_loop_class_name': None, 'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 
'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 'target_kl': 0.1}, 'use_pf_ppo': False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-optimized-multiclinsum-gs', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 20, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 5, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': 'checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs', 'max_actor_ckpt_to_keep': None, 
'max_critic_ckpt_to_keep': None, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto'}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, '_wandb': {}} +2026-02-01 23:27:45,319 INFO MainThread:1939239 [wandb_init.py:init():892] starting backend +2026-02-01 23:27:45,587 INFO MainThread:1939239 [wandb_init.py:init():895] sending inform_init request +2026-02-01 23:27:45,596 INFO MainThread:1939239 [wandb_init.py:init():903] backend started and connected +2026-02-01 23:27:45,607 INFO MainThread:1939239 [wandb_init.py:init():973] updated telemetry +2026-02-01 23:27:45,626 INFO MainThread:1939239 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-01 23:27:47,049 INFO MainThread:1939239 [wandb_init.py:init():1042] starting run threads in backend +2026-02-01 23:27:48,008 INFO MainThread:1939239 [wandb_run.py:_console_start():2529] atexit reg +2026-02-01 23:27:48,009 INFO MainThread:1939239 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-01 23:27:48,009 INFO MainThread:1939239 [wandb_run.py:_redirect():2446] Wrapping output streams. 
+2026-02-01 23:27:48,009 INFO MainThread:1939239 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-01 23:27:48,020 INFO MainThread:1939239 [wandb_init.py:init():1082] run started, returning control to user process diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..25215981b2894427eeca8423d2d416d8ae0ac1e6 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/files/output.log @@ -0,0 +1,6 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/files/requirements.txt b/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c168a7097ef00c84a34fe1db69f95d6eaafb03f2 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/files/requirements.txt @@ -0,0 +1,269 @@ +verl==0.8.0.dev0 +psutil==7.1.3 +colorama==0.4.6 +annotated-doc==0.0.4 +sentry-sdk==2.51.0 +requests==2.32.5 +nvidia-cufile-cu12==1.13.1.3 +ml_dtypes==0.5.4 +xformers==0.0.32.post1 +sglang==0.5.2 +multidict==6.7.1 +typing_extensions==4.15.0 +nvidia-cusparselt-cu12==0.7.1 
+openai-harmony==0.0.4 +transformers==4.56.1 +Werkzeug==3.1.5 +identify==2.6.16 +pytest==9.0.2 +nvidia-cuda-runtime-cu12==12.8.90 +GitPython==3.1.46 +cupy-cuda12x==13.6.0 +tokenizers==0.22.2 +pybind11==3.0.1 +google-api-core==2.29.0 +partial-json-parser==0.2.1.1.post7 +aiohttp-cors==0.8.1 +sniffio==1.3.1 +tensordict==0.10.0 +smart_open==7.5.0 +cffi==2.0.0 +wcwidth==0.5.3 +asttokens==3.0.1 +opencensus==0.11.4 +rpds-py==0.30.0 +py-spy==0.4.1 +gguf==0.17.1 +nvidia-nvjitlink-cu12==12.8.93 +httpx==0.28.1 +cuda-python==13.1.1 +annotated-types==0.7.0 +regex==2026.1.15 +vllm==0.11.0 +idna==3.11 +fsspec==2025.10.0 +parso==0.8.5 +pydantic-extra-types==2.11.0 +MarkupSafe==3.0.3 +cryptography==46.0.4 +filelock==3.20.3 +modelscope==1.34.0 +outlines==0.1.11 +dnspython==2.8.0 +scipy==1.17.0 +zipp==3.23.0 +PyYAML==6.0.3 +onnx==1.20.1 +torchdata==0.11.0 +cuda-pathfinder==1.3.3 +verl==0.8.0.dev0 +httptools==0.7.1 +opencv-python-headless==4.13.0.90 +importlib_metadata==8.7.1 +peft==0.18.1 +opentelemetry-sdk==1.39.1 +python-json-logger==4.0.0 +cuda-bindings==13.1.1 +mdurl==0.1.2 +referencing==0.37.0 +xxhash==3.6.0 +interegular==0.3.3 +fastapi-cli==0.0.20 +uv==0.9.28 +tensorboard==2.20.0 +nvidia-cublas-cu12==12.8.4.1 +sentencepiece==0.2.1 +rich-toolkit==0.18.1 +numpy==2.2.0 +yarl==1.22.0 +opencv-fixer==0.2.5 +tqdm==4.67.2 +python-dotenv==1.2.1 +timm==1.0.16 +aiohappyeyeballs==2.6.1 +decord==0.6.0 +jiter==0.12.0 +airportsdata==20250909 +markdown-it-py==4.0.0 +nvidia-cusolver-cu12==11.7.3.90 +pyarrow==23.0.0 +opentelemetry-proto==1.39.1 +anyio==4.12.1 +pycryptodomex==3.23.0 +prometheus_client==0.24.1 +aiohttp==3.13.3 +urllib3==2.6.3 +pexpect==4.9.0 +pydantic-settings==2.12.0 +distro==1.9.0 +av==16.1.0 +cloudpickle==3.1.2 +mpmath==1.3.0 +certifi==2026.1.4 +antlr4-python3-runtime==4.9.3 +torchvision==0.23.0 +accelerate==1.12.0 +watchfiles==1.1.1 +ruff==0.14.14 +wheel==0.46.3 +omegaconf==2.3.0 +nvidia-cufft-cu12==11.3.3.83 +multiprocess==0.70.18 +frozendict==2.4.7 +sympy==1.14.0 
+setproctitle==1.3.7 +setuptools==79.0.1 +py-cpuinfo==9.0.0 +ipython_pygments_lexers==1.1.1 +rich==14.3.2 +uvicorn==0.40.0 +openai==1.99.1 +outlines_core==0.2.11 +llvmlite==0.44.0 +nvidia-cuda-cupti-cu12==12.8.90 +attrs==25.4.0 +anthropic==0.77.0 +packaging==25.0 +fastrlock==0.8.3 +astor==0.8.1 +pluggy==1.6.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +psutil==7.2.2 +virtualenv==20.36.1 +cbor2==5.8.0 +compressed-tensors==0.11.0 +nvidia-cusparse-cu12==12.5.8.93 +networkx==3.6.1 +httpcore==1.0.9 +onnxscript==0.3.1 +smmap==5.0.2 +opencv-python==4.13.0.90 +traitlets==5.14.3 +python-multipart==0.0.22 +pyvers==0.1.0 +huggingface-hub==0.36.0 +pillow==12.1.0 +jsonschema==4.26.0 +cfgv==3.5.0 +optree==0.18.0 +email-validator==2.3.0 +tabulate==0.9.0 +pre_commit==4.5.1 +msgpack==1.1.2 +depyf==0.19.0 +numba==0.61.2 +six==1.17.0 +aiosignal==1.4.0 +nvidia-nvtx-cu12==12.8.90 +propcache==0.4.1 +torch_memory_saver==0.0.8 +h11==0.16.0 +frozenlist==1.8.0 +websockets==16.0 +nvidia-cudnn-frontend==1.18.0 +build==1.4.0 +google-auth==2.48.0 +pycountry==24.6.1 +stack-data==0.6.3 +typing-inspection==0.4.2 +googleapis-common-protos==1.72.0 +pandas==3.0.0 +typer==0.21.1 +protobuf==6.33.5 +fastapi==0.128.0 +blake3==1.0.8 +opentelemetry-semantic-conventions==0.60b1 +opentelemetry-exporter-prometheus==0.60b1 +nvidia-cudnn-cu12==9.10.2.21 +Markdown==3.10.1 +liger_kernel==0.6.4 +nodeenv==1.10.0 +prompt_toolkit==3.0.52 +torchaudio==2.8.0 +codetiming==1.4.0 +platformdirs==4.5.1 +jsonschema-specifications==2025.9.1 +hydra-core==1.3.2 +tensorboard-data-server==0.7.2 +lm-format-enforcer==0.11.3 +pyasn1_modules==0.4.2 +tiktoken==0.12.0 +starlette==0.50.0 +pyproject_hooks==1.2.0 +flash_attn==2.8.1 +rsa==4.9.1 +ray==2.53.0 +nest-asyncio==1.6.0 +lark==1.2.2 +fastar==0.8.0 +orjson==3.11.6 +prometheus-fastapi-instrumentator==7.1.0 +opentelemetry-api==1.39.1 +mathruler==0.1.0 +pydantic_core==2.41.5 +fastapi-cloud-cli==0.11.0 +pynvml==13.0.1 +loguru==0.7.3 +torch==2.8.0 +msgspec==0.20.0 +nvidia-curand-cu12==10.3.9.90 
+blobfile==3.0.0 +gitdb==4.0.12 +llguidance==0.7.30 +hf_transfer==0.1.9 +nvidia-nccl-cu12==2.27.3 +qwen-vl-utils==0.0.14 +ptyprocess==0.7.0 +ipdb==0.13.13 +opencensus-context==0.1.3 +jedi==0.19.2 +click==8.3.1 +datasets==4.5.0 +soxr==1.0.0 +sgl-kernel==0.3.9.post2 +colorful==0.5.8 +pyasn1==0.6.2 +charset-normalizer==3.4.4 +nvidia-ml-py==13.590.48 +hf-xet==1.2.0 +dill==0.4.0 +absl-py==2.4.0 +pydantic==2.12.5 +wrapt==2.1.0 +flashinfer-python==0.3.1 +python-dateutil==2.9.0.post0 +torchao==0.9.0 +cachetools==7.0.0 +soundfile==0.13.1 +diskcache==5.6.3 +onnx-ir==0.1.15 +docstring_parser==0.17.0 +matplotlib-inline==0.2.1 +Pygments==2.19.2 +wandb==0.24.1 +pure_eval==0.2.3 +ninja==1.13.0 +proto-plus==1.27.0 +pyzmq==27.1.0 +iniconfig==2.3.0 +Jinja2==3.1.6 +megatron-core==0.13.1 +uvloop==0.22.1 +pycparser==3.0 +pylatexenc==2.10 +decorator==5.2.1 +shellingham==1.5.4 +lxml==6.0.2 +safetensors==0.7.0 +xgrammar==0.1.25 +pybase64==1.4.3 +ipython==9.9.0 +mistral_common==1.9.0 +rignore==0.7.6 +einops==0.8.2 +distlib==0.4.0 +triton==3.4.0 +executing==2.2.1 +grpcio==1.76.0 +pip==25.3 +verl==0.8.0.dev0 diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/files/wandb-metadata.json b/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..c98113ea4adcd9c99c95eb8fed04cd0bea10e1a5 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/files/wandb-metadata.json @@ -0,0 +1,93 @@ +{ + "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.12", + "startedAt": "2026-02-02T05:21:02.309741Z", + "args": [ + "--node-ip-address=172.16.34.29", + "--node-manager-port=41019", + "--object-store-name=/tmp/ray/session_2026-02-02_00-15-39_151695_2024332/sockets/plasma_store", + "--raylet-name=/tmp/ray/session_2026-02-02_00-15-39_151695_2024332/sockets/raylet", + "--redis-address=None", + "--metrics-agent-port=65081", + 
"--logging-rotate-bytes=536870912", + "--logging-rotate-backup-count=5", + "--runtime-env-agent-port=65005", + "--gcs-address=172.16.34.29:52673", + "--session-name=session_2026-02-02_00-15-39_151695_2024332", + "--temp-dir=/tmp/ray", + "--webui=127.0.0.1:8301", + "--cluster-id=b01d4540d2c0b95124193cb80239505ce9eac1231bf94d0f11a5a04a", + "--startup-token=128", + "--worker-launch-time-ms=1770009351332", + "--node-id=96cbb8428bea5e36d031f78d70f108a32385be6879c0d176f30d0037", + "--runtime-env-hash=1096984665" + ], + "program": "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py", + "git": { + "remote": "https://github.com/verl-project/verl", + "commit": "d9939add7a2a01923a9088891f913a5d20c4e622" + }, + "email": "shahidulshakib034@gmail.com", + "root": "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train", + "host": "gamma", + "executable": "/home/mshahidul/miniconda3/envs/verl2/bin/python3", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA A100 80GB PCIe", + "gpu_count": 6, + "disk": { + "/": { + "total": "3766429188096", + "used": "182775504896" + } + }, + "memory": { + "total": "1081814863872" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + 
"cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece" + } + ], + "cudaVersion": "13.0", + "writerId": "2n84l30ep06tk0q7b8mblthpwf2n2aod" +} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/logs/debug-core.log b/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..ecc42755b9fa6e89efda243ee76dfa87b2b522c2 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/logs/debug-core.log @@ -0,0 +1,8 @@ +{"time":"2026-02-02T00:21:02.45335853-05:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpy_qbudj3/port-2032775.txt","pid":2032775,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-02T00:21:02.454633876-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":2032775} +{"time":"2026-02-02T00:21:02.454578731-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-2032775-2042152-1333930708/socket","Net":"unix"}} +{"time":"2026-02-02T00:21:02.611177365-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-02T00:21:02.626815711-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"yafkkssq","id":"1(@)"} +{"time":"2026-02-02T00:21:04.191680348-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"yafkkssq","id":"1(@)"} +{"time":"2026-02-02T00:21:10.843998477-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"b72pfl55qz5o"} +{"time":"2026-02-02T00:22:07.0546933-05:00","level":"INFO","msg":"server: 
parent process exited, terminating service process"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/logs/debug-internal.log b/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..5e318f1aa5501ab4659078bf8cdcf93ea630962b --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2026-02-02T00:21:02.627928473-05:00","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-02T00:21:04.188565654-05:00","level":"INFO","msg":"stream: created new stream","id":"yafkkssq"} +{"time":"2026-02-02T00:21:04.188722827-05:00","level":"INFO","msg":"handler: started","stream_id":"yafkkssq"} +{"time":"2026-02-02T00:21:04.191647088-05:00","level":"INFO","msg":"stream: started","id":"yafkkssq"} +{"time":"2026-02-02T00:21:04.191680063-05:00","level":"INFO","msg":"writer: started","stream_id":"yafkkssq"} +{"time":"2026-02-02T00:21:04.191734744-05:00","level":"INFO","msg":"sender: started","stream_id":"yafkkssq"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..63a0abadaeaaef5beef60852f2d1f01f14c8e13b --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/logs/debug.log @@ -0,0 +1,19 @@ +2026-02-02 00:21:02,326 INFO MainThread:2032775 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-02 00:21:02,326 INFO MainThread:2032775 [wandb_setup.py:_flush():81] Configure stats pid to 2032775 +2026-02-02 00:21:02,326 INFO MainThread:2032775 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-02 00:21:02,326 INFO MainThread:2032775 [wandb_init.py:setup_run_log_directory():717] Logging user logs to 
/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/logs/debug.log +2026-02-02 00:21:02,327 INFO MainThread:2032775 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/logs/debug-internal.log +2026-02-02 00:21:02,327 INFO MainThread:2032775 [wandb_init.py:init():844] calling init triggers +2026-02-02 00:21:02,328 INFO MainThread:2032775 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 32, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 
'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 
'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 512, 'response_length': 768, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.6, 'ignore_eos': False, 'enforce_eager': False, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 
'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': False, 'agent': {'_target_': 'verl.workers.config.AgentLoopConfig', 'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 
'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': False}, 'model': {'_target_': 'verl.workers.config.HFModelConfig', 'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 
'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet', 'train_max_samples': -1, 'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 512, 'max_response_length': 768, 'train_batch_size': 512, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': None}, 'datagen': {'path': None, 'name': None}, 'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 
'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 
'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 'reward_loop_class_name': None, 'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 
'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 'target_kl': 0.1}, 'use_pf_ppo': False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-optimized-multiclinsum-gs', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 20, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 5, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': 'checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs', 
'max_actor_ckpt_to_keep': None, 'max_critic_ckpt_to_keep': None, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto'}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, '_wandb': {}} +2026-02-02 00:21:02,328 INFO MainThread:2032775 [wandb_init.py:init():892] starting backend +2026-02-02 00:21:02,611 INFO MainThread:2032775 [wandb_init.py:init():895] sending inform_init request +2026-02-02 00:21:02,621 INFO MainThread:2032775 [wandb_init.py:init():903] backend started and connected +2026-02-02 00:21:02,632 INFO MainThread:2032775 [wandb_init.py:init():973] updated telemetry +2026-02-02 00:21:02,655 INFO MainThread:2032775 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-02 00:21:04,469 INFO MainThread:2032775 [wandb_init.py:init():1042] starting run threads in backend +2026-02-02 00:21:05,814 INFO MainThread:2032775 [wandb_run.py:_console_start():2529] atexit reg +2026-02-02 00:21:05,815 INFO MainThread:2032775 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-02 00:21:05,815 INFO MainThread:2032775 [wandb_run.py:_redirect():2446] Wrapping output streams. 
+2026-02-02 00:21:05,815 INFO MainThread:2032775 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-02 00:21:05,827 INFO MainThread:2032775 [wandb_init.py:init():1082] run started, returning control to user process diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/files/config.yaml b/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c868712d2c4a0cc97394635bf21c859c007da331 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/files/config.yaml @@ -0,0 +1,813 @@ +_wandb: + value: + cli_version: 0.24.1 + e: + 5flmgeolvaugfd6drka4upzthr6fhwra: + args: + - --node-ip-address=172.16.34.29 + - --node-manager-port=42653 + - --object-store-name=/tmp/ray/session_2026-02-02_00-28-21_417242_2058003/sockets/plasma_store + - --raylet-name=/tmp/ray/session_2026-02-02_00-28-21_417242_2058003/sockets/raylet + - --redis-address=None + - --metrics-agent-port=54229 + - --logging-rotate-bytes=536870912 + - --logging-rotate-backup-count=5 + - --runtime-env-agent-port=64526 + - --gcs-address=172.16.34.29:60771 + - --session-name=session_2026-02-02_00-28-21_417242_2058003 + - --temp-dir=/tmp/ray + - --webui=127.0.0.1:8301 + - --cluster-id=20745c9562b53282dca4372bf95416d3e1020b9a50c2a9fda6f302b1 + - --startup-token=128 + - --worker-launch-time-ms=1770010118445 + - --node-id=fd90a9b6902a0dd544915bad8e3e93a09dfe2d2a2394cca298376922 + - --runtime-env-hash=1096984665 + cpu_count: 64 + cpu_count_logical: 128 + cudaVersion: "13.0" + disk: + /: + total: "3766429188096" + used: "182788063232" + email: shahidulshakib034@gmail.com + executable: /home/mshahidul/miniconda3/envs/verl2/bin/python3 + git: + commit: d9939add7a2a01923a9088891f913a5d20c4e622 + remote: https://github.com/verl-project/verl + gpu: NVIDIA A100 80GB PCIe + gpu_count: 6 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + 
name: NVIDIA A100 80GB PCIe + uuid: GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-df506764-0db5-91b4-8ec9-154a3bb8123f + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328 + - architecture: Hopper + cudaCores: 14592 + memoryTotal: "85520809984" + name: NVIDIA H100 PCIe + uuid: GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb + - architecture: Hopper + cudaCores: 14592 + memoryTotal: "85520809984" + name: NVIDIA H100 PCIe + uuid: GPU-d42b6057-13e8-1e88-6aa1-9307df72dece + host: gamma + memory: + total: "1081814863872" + os: Linux-5.15.0-160-generic-x86_64-with-glibc2.35 + program: /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py + python: CPython 3.12.12 + root: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + startedAt: "2026-02-02T05:33:48.432936Z" + writerId: 5flmgeolvaugfd6drka4upzthr6fhwra + m: [] + python_version: 3.12.12 + t: + "1": + - 1 + - 11 + - 30 + - 41 + - 49 + - 50 + - 51 + - 71 + - 95 + - 98 + - 105 + "2": + - 1 + - 11 + - 30 + - 41 + - 49 + - 50 + - 51 + - 71 + - 95 + - 98 + - 105 + "3": + - 2 + - 13 + - 16 + "4": 3.12.12 + "5": 0.24.1 + "6": 4.56.1 + "12": 0.24.1 + "13": linux-x86_64 +actor_rollout_ref: + value: + actor: + _target_: verl.workers.config.FSDPActorConfig + calculate_entropy: false + calculate_sum_pi_squared: false + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + async_save: false + load_contents: + - model + - optimizer + - extra + save_contents: + - model + - optimizer + - extra + clip_ratio: 0.2 + clip_ratio_c: 3 + clip_ratio_high: 0.2 + clip_ratio_low: 0.2 + data_loader_seed: 42 + 
entropy_checkpointing: false + entropy_coeff: 0 + entropy_from_logits_with_chunking: false + freeze_vision_tower: false + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: false + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + grad_clip: 1 + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + loss_agg_mode: token-mean + loss_scale_factor: null + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + betas: + - 0.9 + - 0.999 + clip_grad: 1 + lr: 1e-06 + lr_scheduler_type: constant + lr_warmup_steps: -1 + lr_warmup_steps_ratio: 0 + min_lr_ratio: 0 + num_cycles: 0.5 + optimizer: AdamW + optimizer_impl: torch.optim + override_optimizer_config: null + total_training_steps: 90 + warmup_style: null + weight_decay: 0.01 + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + clip_cov_lb: 1 + clip_cov_ratio: 0.0002 + clip_cov_ub: 5 + kl_cov_ratio: 0.0002 + loss_mode: vanilla + ppo_kl_coef: 0.1 + ppo_epochs: 1 + ppo_max_token_len_per_gpu: 16384 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + ppo_mini_batch_size: 256 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: 
verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + shuffle: false + strategy: fsdp + sum_pi_squared_checkpointing: false + tau_neg: 1.05 + tau_pos: 1 + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false + use_fused_kernels: false + use_kl_loss: true + use_prefix_grouper: false + use_remove_padding: true + use_torch_compile: true + hybrid_engine: true + model: + _target_: verl.workers.config.HFModelConfig + custom_chat_template: null + enable_activation_offload: false + enable_gradient_checkpointing: true + exclude_modules: null + external_lib: null + fused_kernel_options: + impl_backend: torch + hf_config_path: null + lora_adapter_path: null + lora_alpha: 16 + lora_rank: 0 + mtp: + _target_: verl.workers.config.MtpConfig + detach_encoder: false + enable: false + enable_rollout: false + enable_train: false + method: mtp + mtp_loss_scaling_factor: 0.1 + num_speculative_tokens: 1 + speculative_algorithm: EAGLE + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + speculative_num_steps: 3 + path: Qwen/Qwen3-4B-Instruct-2507 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + tokenizer_path: null + trust_remote_code: false + use_fused_kernels: false + use_liger: false + use_remove_padding: true + use_shm: false + nccl_timeout: 600 + ref: + _target_: verl.workers.config.FSDPActorConfig + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: true + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + 
strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + log_prob_max_token_len_per_gpu: 16384 + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_torch_compile: true + rollout: + _target_: verl.workers.config.RolloutConfig + agent: + _target_: verl.workers.config.AgentLoopConfig + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + name: null + path: null + default_agent_loop: single_turn_agent + num_workers: 8 + calculate_log_probs: false + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + cudagraph_capture_sizes: null + data_parallel_size: 1 + disable_log_stats: true + do_sample: true + dtype: bfloat16 + enable_chunked_prefill: true + enable_prefix_caching: true + enable_rollout_routing_replay: false + enforce_eager: false + expert_parallel_size: 1 + free_cache_engine: true + gpu_memory_utilization: 0.6 + ignore_eos: false + layered_summon: false + load_format: dummy + log_prob_max_token_len_per_gpu: 
16384 + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: false + logprobs_mode: processed_logprobs + max_model_len: 8192 + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + mode: async + mtp: + _target_: verl.workers.config.MtpConfig + detach_encoder: false + enable: false + enable_rollout: false + enable_train: false + method: mtp + mtp_loss_scaling_factor: 0.1 + num_speculative_tokens: 1 + speculative_algorithm: EAGLE + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + speculative_num_steps: 3 + multi_stage_wake_up: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + format: hermes + interaction_config_path: null + max_assistant_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + max_user_turns: null + num_repeat_rollouts: null + tokenization_sanity_check_mode: strict + tool_config_path: null + tool_response_truncate_side: middle + use_inference_chat_template: false + "n": 3 + name: vllm + over_sample_rate: 0 + pipeline_model_parallel_size: 1 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + port: 9090 + served_model_name: Qwen/Qwen3-4B-Instruct-2507 + prompt_length: 512 + quantization: null + quantization_config_file: null + response_length: 768 + 
scheduling_policy: fcfs + skip_dump_dir: /tmp/rollout_dump + skip_rollout: false + skip_tokenizer_init: true + temperature: 1 + tensor_model_parallel_size: 1 + top_k: -1 + top_p: 1 + trace: + _target_: verl.workers.config.TraceConfig + backend: null + max_samples_per_step_per_worker: null + token2text: false + val_kwargs: + _target_: verl.workers.config.SamplingConfig + do_sample: false + "n": 1 + temperature: 0 + top_k: -1 + top_p: 1 +algorithm: + value: + _target_: verl.trainer.config.AlgoConfig + adv_estimator: grpo + gamma: 1 + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + horizon: 10000 + kl_coef: 0.001 + target_kl: 0.1 + type: fixed + kl_penalty: kl + lam: 1 + norm_adv_by_std_in_grpo: true + pf_ppo: + reweight_method: pow + weight_pow: 2 + rollout_correction: + bypass_mode: false + loss_type: ppo_clip + rollout_is: null + rollout_is_batch_normalize: false + rollout_is_threshold: 2 + rollout_rs: null + rollout_rs_threshold: null + use_kl_in_reward: false + use_pf_ppo: false +critic: + value: + _target_: verl.workers.config.FSDPCriticConfig + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + async_save: false + load_contents: + - model + - optimizer + - extra + save_contents: + - model + - optimizer + - extra + cliprange_value: 0.5 + data_loader_seed: 42 + enable: null + forward_max_token_len_per_gpu: 32768 + forward_micro_batch_size: null + forward_micro_batch_size_per_gpu: null + grad_clip: 1 + loss_agg_mode: token-mean + model: + _target_: verl.workers.config.FSDPCriticModelCfg + enable_activation_offload: false + enable_gradient_checkpointing: true + external_lib: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: false + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true 
+ seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + lora_alpha: 16 + lora_rank: 0 + path: ~/models/deepseek-llm-7b-chat + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + tokenizer_path: Qwen/Qwen3-4B-Instruct-2507 + trust_remote_code: false + use_remove_padding: false + use_shm: false + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + betas: + - 0.9 + - 0.999 + clip_grad: 1 + lr: 1e-05 + lr_scheduler_type: constant + lr_warmup_steps: -1 + lr_warmup_steps_ratio: 0 + min_lr_ratio: 0 + num_cycles: 0.5 + optimizer: AdamW + optimizer_impl: torch.optim + override_optimizer_config: null + total_training_steps: 90 + warmup_style: null + weight_decay: 0.01 + ppo_epochs: 1 + ppo_max_token_len_per_gpu: 32768 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: null + ppo_mini_batch_size: 256 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + shuffle: false + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false +custom_reward_function: + value: + name: compute_score + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +data: + value: + custom_cls: + name: null + path: null + datagen: + name: null + path: null + dataloader_num_workers: 8 + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + image_key: 
images + image_patch_size: 14 + max_prompt_length: 512 + max_response_length: 768 + prompt_key: prompt + return_full_prompt: false + return_multi_modal_inputs: true + return_raw_chat: true + return_raw_input_ids: false + reward_fn_key: data_source + sampler: + class_name: null + class_path: null + seed: null + shuffle: true + tokenizer: null + tool_config_path: null + train_batch_size: 512 + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + train_max_samples: -1 + truncation: error + trust_remote_code: false + use_shm: false + val_batch_size: null + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + val_max_samples: -1 + validation_shuffle: false + video_key: videos +global_profiler: + value: + _target_: verl.utils.profiler.ProfilerConfig + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + controller_nsight_options: + cuda-graph-trace: graph + cuda-memory-usage: "true" + trace: cuda,nvtx,cublas,ucx + discrete: false + worker_nsight_options: + capture-range: cudaProfilerApi + capture-range-end: null + cuda-graph-trace: graph + cuda-memory-usage: "true" + kill: none + trace: cuda,nvtx,cublas,ucx + torch_memory: + context: all + stack_depth: 32 + stacks: all + trace_alloc_max_entries: 100000 + profile_continuous_steps: false + save_path: outputs/profile + steps: null + tool: null +ray_kwargs: + value: + ray_init: + num_cpus: null + timeline_json_file: null +reward_manager: + value: + _target_: verl.trainer.config.config.RewardManagerConfig + module: + _target_: verl.trainer.config.config.ModuleConfig + name: custom_reward_manager + path: null + name: naive + source: register +reward_model: + value: + enable: false + enable_resource_pool: false + forward_max_token_len_per_gpu: 32768 + launch_reward_fn_async: false + max_length: null + micro_batch_size: null + micro_batch_size_per_gpu: null + model: + external_lib: null + fsdp_config: + _target_: 
verl.workers.config.FSDPEngineConfig + forward_prefetch: false + fsdp_size: -1 + param_offload: false + reshard_after_forward: true + wrap_policy: + min_num_params: 0 + input_tokenizer: Qwen/Qwen3-4B-Instruct-2507 + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + trust_remote_code: false + use_fused_kernels: false + use_remove_padding: false + use_shm: false + n_gpus_per_node: 8 + nnodes: 0 + num_workers: 1 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + reward_loop_class_name: null + reward_loop_module_path: null + reward_loop_source: register + reward_manager: naive + rollout: + _target_: verl.workers.config.RolloutConfig + cudagraph_capture_sizes: null + data_parallel_size: 1 + disable_log_stats: true + dtype: bfloat16 + enable_chunked_prefill: true + enable_prefix_caching: true + enforce_eager: true + expert_parallel_size: 1 + free_cache_engine: true + gpu_memory_utilization: 0.5 + limit_images: null + load_format: auto + max_model_len: null + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + name: ??? 
+ prompt_length: 2048 + response_length: 2048 + skip_tokenizer_init: false + tensor_model_parallel_size: 2 + sandbox_fusion: + max_concurrent: 64 + memory_limit_mb: 1024 + url: null + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false + use_reward_loop: true +trainer: + value: + balance_batch: true + critic_warmup: 0 + default_hdfs_dir: null + default_local_dir: checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs + del_local_ckpt_after_load: false + device: cuda + esi_redundant_time: 0 + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + log_val_generations: 0 + logger: + - console + - wandb + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + n_gpus_per_node: 2 + nnodes: 1 + project_name: readctrl-verl + ray_wait_register_center_timeout: 300 + resume_from_path: null + resume_mode: auto + rollout_data_dir: null + save_freq: 20 + test_freq: 5 + total_epochs: 15 + total_training_steps: null + use_legacy_worker_impl: auto + val_before_train: true + val_only: false + validation_data_dir: null +transfer_queue: + value: + enable: false diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..25215981b2894427eeca8423d2d416d8ae0ac1e6 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/files/output.log @@ -0,0 +1,6 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/files/requirements.txt b/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c168a7097ef00c84a34fe1db69f95d6eaafb03f2 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/files/requirements.txt @@ -0,0 +1,269 @@ +verl==0.8.0.dev0 +psutil==7.1.3 +colorama==0.4.6 +annotated-doc==0.0.4 +sentry-sdk==2.51.0 +requests==2.32.5 +nvidia-cufile-cu12==1.13.1.3 +ml_dtypes==0.5.4 +xformers==0.0.32.post1 +sglang==0.5.2 +multidict==6.7.1 +typing_extensions==4.15.0 +nvidia-cusparselt-cu12==0.7.1 +openai-harmony==0.0.4 +transformers==4.56.1 +Werkzeug==3.1.5 +identify==2.6.16 +pytest==9.0.2 +nvidia-cuda-runtime-cu12==12.8.90 +GitPython==3.1.46 +cupy-cuda12x==13.6.0 +tokenizers==0.22.2 +pybind11==3.0.1 +google-api-core==2.29.0 +partial-json-parser==0.2.1.1.post7 +aiohttp-cors==0.8.1 +sniffio==1.3.1 +tensordict==0.10.0 +smart_open==7.5.0 +cffi==2.0.0 +wcwidth==0.5.3 +asttokens==3.0.1 +opencensus==0.11.4 +rpds-py==0.30.0 +py-spy==0.4.1 +gguf==0.17.1 +nvidia-nvjitlink-cu12==12.8.93 +httpx==0.28.1 +cuda-python==13.1.1 +annotated-types==0.7.0 +regex==2026.1.15 +vllm==0.11.0 +idna==3.11 +fsspec==2025.10.0 +parso==0.8.5 +pydantic-extra-types==2.11.0 +MarkupSafe==3.0.3 +cryptography==46.0.4 +filelock==3.20.3 +modelscope==1.34.0 +outlines==0.1.11 +dnspython==2.8.0 +scipy==1.17.0 +zipp==3.23.0 +PyYAML==6.0.3 
+onnx==1.20.1 +torchdata==0.11.0 +cuda-pathfinder==1.3.3 +verl==0.8.0.dev0 +httptools==0.7.1 +opencv-python-headless==4.13.0.90 +importlib_metadata==8.7.1 +peft==0.18.1 +opentelemetry-sdk==1.39.1 +python-json-logger==4.0.0 +cuda-bindings==13.1.1 +mdurl==0.1.2 +referencing==0.37.0 +xxhash==3.6.0 +interegular==0.3.3 +fastapi-cli==0.0.20 +uv==0.9.28 +tensorboard==2.20.0 +nvidia-cublas-cu12==12.8.4.1 +sentencepiece==0.2.1 +rich-toolkit==0.18.1 +numpy==2.2.0 +yarl==1.22.0 +opencv-fixer==0.2.5 +tqdm==4.67.2 +python-dotenv==1.2.1 +timm==1.0.16 +aiohappyeyeballs==2.6.1 +decord==0.6.0 +jiter==0.12.0 +airportsdata==20250909 +markdown-it-py==4.0.0 +nvidia-cusolver-cu12==11.7.3.90 +pyarrow==23.0.0 +opentelemetry-proto==1.39.1 +anyio==4.12.1 +pycryptodomex==3.23.0 +prometheus_client==0.24.1 +aiohttp==3.13.3 +urllib3==2.6.3 +pexpect==4.9.0 +pydantic-settings==2.12.0 +distro==1.9.0 +av==16.1.0 +cloudpickle==3.1.2 +mpmath==1.3.0 +certifi==2026.1.4 +antlr4-python3-runtime==4.9.3 +torchvision==0.23.0 +accelerate==1.12.0 +watchfiles==1.1.1 +ruff==0.14.14 +wheel==0.46.3 +omegaconf==2.3.0 +nvidia-cufft-cu12==11.3.3.83 +multiprocess==0.70.18 +frozendict==2.4.7 +sympy==1.14.0 +setproctitle==1.3.7 +setuptools==79.0.1 +py-cpuinfo==9.0.0 +ipython_pygments_lexers==1.1.1 +rich==14.3.2 +uvicorn==0.40.0 +openai==1.99.1 +outlines_core==0.2.11 +llvmlite==0.44.0 +nvidia-cuda-cupti-cu12==12.8.90 +attrs==25.4.0 +anthropic==0.77.0 +packaging==25.0 +fastrlock==0.8.3 +astor==0.8.1 +pluggy==1.6.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +psutil==7.2.2 +virtualenv==20.36.1 +cbor2==5.8.0 +compressed-tensors==0.11.0 +nvidia-cusparse-cu12==12.5.8.93 +networkx==3.6.1 +httpcore==1.0.9 +onnxscript==0.3.1 +smmap==5.0.2 +opencv-python==4.13.0.90 +traitlets==5.14.3 +python-multipart==0.0.22 +pyvers==0.1.0 +huggingface-hub==0.36.0 +pillow==12.1.0 +jsonschema==4.26.0 +cfgv==3.5.0 +optree==0.18.0 +email-validator==2.3.0 +tabulate==0.9.0 +pre_commit==4.5.1 +msgpack==1.1.2 +depyf==0.19.0 +numba==0.61.2 +six==1.17.0 
+aiosignal==1.4.0 +nvidia-nvtx-cu12==12.8.90 +propcache==0.4.1 +torch_memory_saver==0.0.8 +h11==0.16.0 +frozenlist==1.8.0 +websockets==16.0 +nvidia-cudnn-frontend==1.18.0 +build==1.4.0 +google-auth==2.48.0 +pycountry==24.6.1 +stack-data==0.6.3 +typing-inspection==0.4.2 +googleapis-common-protos==1.72.0 +pandas==3.0.0 +typer==0.21.1 +protobuf==6.33.5 +fastapi==0.128.0 +blake3==1.0.8 +opentelemetry-semantic-conventions==0.60b1 +opentelemetry-exporter-prometheus==0.60b1 +nvidia-cudnn-cu12==9.10.2.21 +Markdown==3.10.1 +liger_kernel==0.6.4 +nodeenv==1.10.0 +prompt_toolkit==3.0.52 +torchaudio==2.8.0 +codetiming==1.4.0 +platformdirs==4.5.1 +jsonschema-specifications==2025.9.1 +hydra-core==1.3.2 +tensorboard-data-server==0.7.2 +lm-format-enforcer==0.11.3 +pyasn1_modules==0.4.2 +tiktoken==0.12.0 +starlette==0.50.0 +pyproject_hooks==1.2.0 +flash_attn==2.8.1 +rsa==4.9.1 +ray==2.53.0 +nest-asyncio==1.6.0 +lark==1.2.2 +fastar==0.8.0 +orjson==3.11.6 +prometheus-fastapi-instrumentator==7.1.0 +opentelemetry-api==1.39.1 +mathruler==0.1.0 +pydantic_core==2.41.5 +fastapi-cloud-cli==0.11.0 +pynvml==13.0.1 +loguru==0.7.3 +torch==2.8.0 +msgspec==0.20.0 +nvidia-curand-cu12==10.3.9.90 +blobfile==3.0.0 +gitdb==4.0.12 +llguidance==0.7.30 +hf_transfer==0.1.9 +nvidia-nccl-cu12==2.27.3 +qwen-vl-utils==0.0.14 +ptyprocess==0.7.0 +ipdb==0.13.13 +opencensus-context==0.1.3 +jedi==0.19.2 +click==8.3.1 +datasets==4.5.0 +soxr==1.0.0 +sgl-kernel==0.3.9.post2 +colorful==0.5.8 +pyasn1==0.6.2 +charset-normalizer==3.4.4 +nvidia-ml-py==13.590.48 +hf-xet==1.2.0 +dill==0.4.0 +absl-py==2.4.0 +pydantic==2.12.5 +wrapt==2.1.0 +flashinfer-python==0.3.1 +python-dateutil==2.9.0.post0 +torchao==0.9.0 +cachetools==7.0.0 +soundfile==0.13.1 +diskcache==5.6.3 +onnx-ir==0.1.15 +docstring_parser==0.17.0 +matplotlib-inline==0.2.1 +Pygments==2.19.2 +wandb==0.24.1 +pure_eval==0.2.3 +ninja==1.13.0 +proto-plus==1.27.0 +pyzmq==27.1.0 +iniconfig==2.3.0 +Jinja2==3.1.6 +megatron-core==0.13.1 +uvloop==0.22.1 +pycparser==3.0 
+pylatexenc==2.10 +decorator==5.2.1 +shellingham==1.5.4 +lxml==6.0.2 +safetensors==0.7.0 +xgrammar==0.1.25 +pybase64==1.4.3 +ipython==9.9.0 +mistral_common==1.9.0 +rignore==0.7.6 +einops==0.8.2 +distlib==0.4.0 +triton==3.4.0 +executing==2.2.1 +grpcio==1.76.0 +pip==25.3 +verl==0.8.0.dev0 diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/files/wandb-metadata.json b/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..c816e36e70cb1941325fbf18ef916fdd258f42e6 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/files/wandb-metadata.json @@ -0,0 +1,93 @@ +{ + "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.12", + "startedAt": "2026-02-02T05:33:48.432936Z", + "args": [ + "--node-ip-address=172.16.34.29", + "--node-manager-port=42653", + "--object-store-name=/tmp/ray/session_2026-02-02_00-28-21_417242_2058003/sockets/plasma_store", + "--raylet-name=/tmp/ray/session_2026-02-02_00-28-21_417242_2058003/sockets/raylet", + "--redis-address=None", + "--metrics-agent-port=54229", + "--logging-rotate-bytes=536870912", + "--logging-rotate-backup-count=5", + "--runtime-env-agent-port=64526", + "--gcs-address=172.16.34.29:60771", + "--session-name=session_2026-02-02_00-28-21_417242_2058003", + "--temp-dir=/tmp/ray", + "--webui=127.0.0.1:8301", + "--cluster-id=20745c9562b53282dca4372bf95416d3e1020b9a50c2a9fda6f302b1", + "--startup-token=128", + "--worker-launch-time-ms=1770010118445", + "--node-id=fd90a9b6902a0dd544915bad8e3e93a09dfe2d2a2394cca298376922", + "--runtime-env-hash=1096984665" + ], + "program": "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py", + "git": { + "remote": "https://github.com/verl-project/verl", + "commit": "d9939add7a2a01923a9088891f913a5d20c4e622" + }, + "email": "shahidulshakib034@gmail.com", + "root": 
"/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train", + "host": "gamma", + "executable": "/home/mshahidul/miniconda3/envs/verl2/bin/python3", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA A100 80GB PCIe", + "gpu_count": 6, + "disk": { + "/": { + "total": "3766429188096", + "used": "182788063232" + } + }, + "memory": { + "total": "1081814863872" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece" + } + ], + "cudaVersion": "13.0", + "writerId": "5flmgeolvaugfd6drka4upzthr6fhwra" +} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/files/wandb-summary.json b/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..d36db3b5803a02ddbdedb9c4a80ca513af26e4ff --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/files/wandb-summary.json @@ -0,0 +1 @@ 
+{"_wandb":{"runtime":45},"_runtime":45} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/logs/debug-core.log b/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..537da34fc840c956ada16dbca5a4fe727706a4c5 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/logs/debug-core.log @@ -0,0 +1,12 @@ +{"time":"2026-02-02T00:33:48.554562314-05:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpj4h9zle5/port-2066630.txt","pid":2066630,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-02T00:33:48.55545305-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":2066630} +{"time":"2026-02-02T00:33:48.555469153-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-2066630-2076405-3129699608/socket","Net":"unix"}} +{"time":"2026-02-02T00:33:48.7195227-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-02T00:33:48.736136851-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"yi6yp3s2","id":"1(@)"} +{"time":"2026-02-02T00:33:50.433651683-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"yi6yp3s2","id":"1(@)"} +{"time":"2026-02-02T00:33:56.483760941-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"3y3rbt99dpjr"} +{"time":"2026-02-02T00:34:35.831501672-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"3y3rbt99dpjr"} +{"time":"2026-02-02T00:34:36.394702432-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"3y3rbt99dpjr"} +{"time":"2026-02-02T00:34:36.397170751-05:00","level":"INFO","msg":"handleInformFinish: finish message 
received","streamId":"yi6yp3s2","id":"1(@)"} +{"time":"2026-02-02T00:34:36.400846141-05:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"yi6yp3s2","id":"1(@)"} +{"time":"2026-02-02T00:34:37.951267002-05:00","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/logs/debug-internal.log b/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..e300ed543465f95da0d17e2021e7864498635190 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2026-02-02T00:33:48.737426484-05:00","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-02T00:33:50.429469078-05:00","level":"INFO","msg":"stream: created new stream","id":"yi6yp3s2"} +{"time":"2026-02-02T00:33:50.429606004-05:00","level":"INFO","msg":"handler: started","stream_id":"yi6yp3s2"} +{"time":"2026-02-02T00:33:50.433623686-05:00","level":"INFO","msg":"stream: started","id":"yi6yp3s2"} +{"time":"2026-02-02T00:33:50.433670159-05:00","level":"INFO","msg":"sender: started","stream_id":"yi6yp3s2"} +{"time":"2026-02-02T00:33:50.433681567-05:00","level":"INFO","msg":"writer: started","stream_id":"yi6yp3s2"} +{"time":"2026-02-02T00:34:36.270316655-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-02T00:34:36.3893553-05:00","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2026-02-02T00:34:36.397216025-05:00","level":"INFO","msg":"stream: closing","id":"yi6yp3s2"} +{"time":"2026-02-02T00:34:36.397240866-05:00","level":"INFO","msg":"handler: closed","stream_id":"yi6yp3s2"} +{"time":"2026-02-02T00:34:36.400274468-05:00","level":"INFO","msg":"sender: closed","stream_id":"yi6yp3s2"} 
+{"time":"2026-02-02T00:34:36.400302144-05:00","level":"INFO","msg":"stream: closed","id":"yi6yp3s2"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..0f1cc5baf629105c52c23e8813bb78f94b34ecca --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/logs/debug.log @@ -0,0 +1,24 @@ +2026-02-02 00:33:48,446 INFO MainThread:2066630 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-02 00:33:48,446 INFO MainThread:2066630 [wandb_setup.py:_flush():81] Configure stats pid to 2066630 +2026-02-02 00:33:48,446 INFO MainThread:2066630 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-02 00:33:48,446 INFO MainThread:2066630 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/logs/debug.log +2026-02-02 00:33:48,447 INFO MainThread:2066630 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/logs/debug-internal.log +2026-02-02 00:33:48,447 INFO MainThread:2066630 [wandb_init.py:init():844] calling init triggers +2026-02-02 00:33:48,447 INFO MainThread:2066630 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'fsdp_config': 
{'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 32, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 
'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 
'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 512, 'response_length': 768, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.6, 'ignore_eos': False, 'enforce_eager': False, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': 
False, 'agent': {'_target_': 'verl.workers.config.AgentLoopConfig', 'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': 
False}, 'model': {'_target_': 'verl.workers.config.HFModelConfig', 'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet', 'train_max_samples': -1, 'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 512, 'max_response_length': 768, 'train_batch_size': 512, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': 
None}, 'datagen': {'path': None, 'name': None}, 'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 
'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 
'reward_loop_class_name': None, 'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 
'target_kl': 0.1}, 'use_pf_ppo': False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-optimized-multiclinsum-gs', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 20, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 5, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': 'checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs', 'max_actor_ckpt_to_keep': None, 'max_critic_ckpt_to_keep': None, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto'}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, '_wandb': {}} +2026-02-02 00:33:48,448 INFO MainThread:2066630 [wandb_init.py:init():892] starting backend 
+2026-02-02 00:33:48,719 INFO MainThread:2066630 [wandb_init.py:init():895] sending inform_init request +2026-02-02 00:33:48,728 INFO MainThread:2066630 [wandb_init.py:init():903] backend started and connected +2026-02-02 00:33:48,741 INFO MainThread:2066630 [wandb_init.py:init():973] updated telemetry +2026-02-02 00:33:48,763 INFO MainThread:2066630 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-02 00:33:50,647 INFO MainThread:2066630 [wandb_init.py:init():1042] starting run threads in backend +2026-02-02 00:33:51,455 INFO MainThread:2066630 [wandb_run.py:_console_start():2529] atexit reg +2026-02-02 00:33:51,455 INFO MainThread:2066630 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-02 00:33:51,455 INFO MainThread:2066630 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-02 00:33:51,456 INFO MainThread:2066630 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-02 00:33:51,470 INFO MainThread:2066630 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-02 00:34:35,828 INFO MainThread:2066630 [wandb_run.py:_finish():2295] finishing run shahidulshakib034-khulna-university-of-engineering-techn/readctrl-verl/yi6yp3s2 +2026-02-02 00:34:35,829 INFO MainThread:2066630 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0 +2026-02-02 00:34:35,830 INFO MainThread:2066630 [wandb_run.py:_restore():2476] restore +2026-02-02 00:34:35,830 INFO MainThread:2066630 [wandb_run.py:_restore():2482] restore done +2026-02-02 00:34:36,394 INFO MainThread:2066630 [wandb_run.py:_footer_sync_info():3871] logging synced files diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/files/config.yaml b/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1a156ec96052484a3d168d0d13faa8e22d62d60d --- /dev/null +++ 
b/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/files/config.yaml @@ -0,0 +1,813 @@ +_wandb: + value: + cli_version: 0.24.1 + e: + xxgr8v6n1bswa9pz7my37vzchgijz3he: + args: + - --node-ip-address=172.16.34.29 + - --node-manager-port=33713 + - --object-store-name=/tmp/ray/session_2026-02-02_00-41-24_572915_2084427/sockets/plasma_store + - --raylet-name=/tmp/ray/session_2026-02-02_00-41-24_572915_2084427/sockets/raylet + - --redis-address=None + - --metrics-agent-port=64056 + - --logging-rotate-bytes=536870912 + - --logging-rotate-backup-count=5 + - --runtime-env-agent-port=50957 + - --gcs-address=172.16.34.29:63507 + - --session-name=session_2026-02-02_00-41-24_572915_2084427 + - --temp-dir=/tmp/ray + - --webui=127.0.0.1:8301 + - --cluster-id=fd1eb3fd7a8cd045ec4d4d1e0539cbe10c6ee6c109bbb084f4df4e1f + - --startup-token=128 + - --worker-launch-time-ms=1770010897022 + - --node-id=8d3720ca1709ad73b4f96f5143f8f8aa65b540dffd6dd46d6434353e + - --runtime-env-hash=1096984665 + cpu_count: 64 + cpu_count_logical: 128 + cudaVersion: "13.0" + disk: + /: + total: "3766429188096" + used: "182797938688" + email: shahidulshakib034@gmail.com + executable: /home/mshahidul/miniconda3/envs/verl2/bin/python3 + git: + commit: d9939add7a2a01923a9088891f913a5d20c4e622 + remote: https://github.com/verl-project/verl + gpu: NVIDIA A100 80GB PCIe + gpu_count: 6 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-df506764-0db5-91b4-8ec9-154a3bb8123f + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328 + - 
architecture: Hopper + cudaCores: 14592 + memoryTotal: "85520809984" + name: NVIDIA H100 PCIe + uuid: GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb + - architecture: Hopper + cudaCores: 14592 + memoryTotal: "85520809984" + name: NVIDIA H100 PCIe + uuid: GPU-d42b6057-13e8-1e88-6aa1-9307df72dece + host: gamma + memory: + total: "1081814863872" + os: Linux-5.15.0-160-generic-x86_64-with-glibc2.35 + program: /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py + python: CPython 3.12.12 + root: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + startedAt: "2026-02-02T05:46:49.595754Z" + writerId: xxgr8v6n1bswa9pz7my37vzchgijz3he + m: [] + python_version: 3.12.12 + t: + "1": + - 1 + - 11 + - 30 + - 41 + - 49 + - 50 + - 51 + - 71 + - 95 + - 98 + - 105 + "2": + - 1 + - 11 + - 30 + - 41 + - 49 + - 50 + - 51 + - 71 + - 95 + - 98 + - 105 + "3": + - 2 + - 13 + - 16 + "4": 3.12.12 + "5": 0.24.1 + "6": 4.56.1 + "12": 0.24.1 + "13": linux-x86_64 +actor_rollout_ref: + value: + actor: + _target_: verl.workers.config.FSDPActorConfig + calculate_entropy: false + calculate_sum_pi_squared: false + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + async_save: false + load_contents: + - model + - optimizer + - extra + save_contents: + - model + - optimizer + - extra + clip_ratio: 0.2 + clip_ratio_c: 3 + clip_ratio_high: 0.2 + clip_ratio_low: 0.2 + data_loader_seed: 42 + entropy_checkpointing: false + entropy_coeff: 0 + entropy_from_logits_with_chunking: false + freeze_vision_tower: false + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: false + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 
1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + grad_clip: 1 + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + loss_agg_mode: token-mean + loss_scale_factor: null + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + betas: + - 0.9 + - 0.999 + clip_grad: 1 + lr: 1e-06 + lr_scheduler_type: constant + lr_warmup_steps: -1 + lr_warmup_steps_ratio: 0 + min_lr_ratio: 0 + num_cycles: 0.5 + optimizer: AdamW + optimizer_impl: torch.optim + override_optimizer_config: null + total_training_steps: 90 + warmup_style: null + weight_decay: 0.01 + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + clip_cov_lb: 1 + clip_cov_ratio: 0.0002 + clip_cov_ub: 5 + kl_cov_ratio: 0.0002 + loss_mode: vanilla + ppo_kl_coef: 0.1 + ppo_epochs: 1 + ppo_max_token_len_per_gpu: 16384 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + ppo_mini_batch_size: 256 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + shuffle: false + strategy: fsdp + sum_pi_squared_checkpointing: false + tau_neg: 1.05 + tau_pos: 1 + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false + use_fused_kernels: false + use_kl_loss: true + use_prefix_grouper: false + use_remove_padding: true + use_torch_compile: true + hybrid_engine: true + model: + _target_: 
verl.workers.config.HFModelConfig + custom_chat_template: null + enable_activation_offload: false + enable_gradient_checkpointing: true + exclude_modules: null + external_lib: null + fused_kernel_options: + impl_backend: torch + hf_config_path: null + lora_adapter_path: null + lora_alpha: 16 + lora_rank: 0 + mtp: + _target_: verl.workers.config.MtpConfig + detach_encoder: false + enable: false + enable_rollout: false + enable_train: false + method: mtp + mtp_loss_scaling_factor: 0.1 + num_speculative_tokens: 1 + speculative_algorithm: EAGLE + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + speculative_num_steps: 3 + path: Qwen/Qwen3-4B-Instruct-2507 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + tokenizer_path: null + trust_remote_code: false + use_fused_kernels: false + use_liger: false + use_remove_padding: true + use_shm: false + nccl_timeout: 600 + ref: + _target_: verl.workers.config.FSDPActorConfig + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: true + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + log_prob_max_token_len_per_gpu: 16384 + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: 
+ _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_torch_compile: true + rollout: + _target_: verl.workers.config.RolloutConfig + agent: + _target_: verl.workers.config.AgentLoopConfig + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + name: null + path: null + default_agent_loop: single_turn_agent + num_workers: 8 + calculate_log_probs: false + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + cudagraph_capture_sizes: null + data_parallel_size: 1 + disable_log_stats: true + do_sample: true + dtype: bfloat16 + enable_chunked_prefill: true + enable_prefix_caching: true + enable_rollout_routing_replay: false + enforce_eager: false + expert_parallel_size: 1 + free_cache_engine: true + gpu_memory_utilization: 0.6 + ignore_eos: false + layered_summon: false + load_format: dummy + log_prob_max_token_len_per_gpu: 16384 + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: false + logprobs_mode: processed_logprobs + max_model_len: 8192 + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + mode: async + mtp: + _target_: verl.workers.config.MtpConfig + detach_encoder: false + enable: false + enable_rollout: false + enable_train: false + method: mtp + mtp_loss_scaling_factor: 0.1 + num_speculative_tokens: 1 + speculative_algorithm: EAGLE + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + speculative_num_steps: 3 + 
multi_stage_wake_up: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + format: hermes + interaction_config_path: null + max_assistant_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + max_user_turns: null + num_repeat_rollouts: null + tokenization_sanity_check_mode: strict + tool_config_path: null + tool_response_truncate_side: middle + use_inference_chat_template: false + "n": 3 + name: vllm + over_sample_rate: 0 + pipeline_model_parallel_size: 1 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + port: 9090 + served_model_name: Qwen/Qwen3-4B-Instruct-2507 + prompt_length: 512 + quantization: null + quantization_config_file: null + response_length: 768 + scheduling_policy: fcfs + skip_dump_dir: /tmp/rollout_dump + skip_rollout: false + skip_tokenizer_init: true + temperature: 1 + tensor_model_parallel_size: 1 + top_k: -1 + top_p: 1 + trace: + _target_: verl.workers.config.TraceConfig + backend: null + max_samples_per_step_per_worker: null + token2text: false + val_kwargs: + _target_: verl.workers.config.SamplingConfig + do_sample: false + "n": 1 + temperature: 0 + top_k: -1 + top_p: 1 +algorithm: + value: + _target_: verl.trainer.config.AlgoConfig + adv_estimator: grpo + gamma: 1 + kl_ctrl: + _target_: 
verl.trainer.config.KLControlConfig + horizon: 10000 + kl_coef: 0.001 + target_kl: 0.1 + type: fixed + kl_penalty: kl + lam: 1 + norm_adv_by_std_in_grpo: true + pf_ppo: + reweight_method: pow + weight_pow: 2 + rollout_correction: + bypass_mode: false + loss_type: ppo_clip + rollout_is: null + rollout_is_batch_normalize: false + rollout_is_threshold: 2 + rollout_rs: null + rollout_rs_threshold: null + use_kl_in_reward: false + use_pf_ppo: false +critic: + value: + _target_: verl.workers.config.FSDPCriticConfig + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + async_save: false + load_contents: + - model + - optimizer + - extra + save_contents: + - model + - optimizer + - extra + cliprange_value: 0.5 + data_loader_seed: 42 + enable: null + forward_max_token_len_per_gpu: 32768 + forward_micro_batch_size: null + forward_micro_batch_size_per_gpu: null + grad_clip: 1 + loss_agg_mode: token-mean + model: + _target_: verl.workers.config.FSDPCriticModelCfg + enable_activation_offload: false + enable_gradient_checkpointing: true + external_lib: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: false + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + lora_alpha: 16 + lora_rank: 0 + path: ~/models/deepseek-llm-7b-chat + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + tokenizer_path: Qwen/Qwen3-4B-Instruct-2507 + trust_remote_code: false + use_remove_padding: false + use_shm: false + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + betas: + - 0.9 + - 0.999 + clip_grad: 1 + lr: 1e-05 + lr_scheduler_type: constant 
+ lr_warmup_steps: -1 + lr_warmup_steps_ratio: 0 + min_lr_ratio: 0 + num_cycles: 0.5 + optimizer: AdamW + optimizer_impl: torch.optim + override_optimizer_config: null + total_training_steps: 90 + warmup_style: null + weight_decay: 0.01 + ppo_epochs: 1 + ppo_max_token_len_per_gpu: 32768 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: null + ppo_mini_batch_size: 256 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + shuffle: false + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false +custom_reward_function: + value: + name: compute_score + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +data: + value: + custom_cls: + name: null + path: null + datagen: + name: null + path: null + dataloader_num_workers: 8 + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + image_key: images + image_patch_size: 14 + max_prompt_length: 512 + max_response_length: 768 + prompt_key: prompt + return_full_prompt: false + return_multi_modal_inputs: true + return_raw_chat: true + return_raw_input_ids: false + reward_fn_key: data_source + sampler: + class_name: null + class_path: null + seed: null + shuffle: true + tokenizer: null + tool_config_path: null + train_batch_size: 512 + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + train_max_samples: -1 + truncation: error + trust_remote_code: false + 
use_shm: false + val_batch_size: null + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + val_max_samples: -1 + validation_shuffle: false + video_key: videos +global_profiler: + value: + _target_: verl.utils.profiler.ProfilerConfig + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + controller_nsight_options: + cuda-graph-trace: graph + cuda-memory-usage: "true" + trace: cuda,nvtx,cublas,ucx + discrete: false + worker_nsight_options: + capture-range: cudaProfilerApi + capture-range-end: null + cuda-graph-trace: graph + cuda-memory-usage: "true" + kill: none + trace: cuda,nvtx,cublas,ucx + torch_memory: + context: all + stack_depth: 32 + stacks: all + trace_alloc_max_entries: 100000 + profile_continuous_steps: false + save_path: outputs/profile + steps: null + tool: null +ray_kwargs: + value: + ray_init: + num_cpus: null + timeline_json_file: null +reward_manager: + value: + _target_: verl.trainer.config.config.RewardManagerConfig + module: + _target_: verl.trainer.config.config.ModuleConfig + name: custom_reward_manager + path: null + name: naive + source: register +reward_model: + value: + enable: false + enable_resource_pool: false + forward_max_token_len_per_gpu: 32768 + launch_reward_fn_async: false + max_length: null + micro_batch_size: null + micro_batch_size_per_gpu: null + model: + external_lib: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + forward_prefetch: false + fsdp_size: -1 + param_offload: false + reshard_after_forward: true + wrap_policy: + min_num_params: 0 + input_tokenizer: Qwen/Qwen3-4B-Instruct-2507 + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + trust_remote_code: false + use_fused_kernels: false + use_remove_padding: false + use_shm: false + n_gpus_per_node: 8 + nnodes: 0 + num_workers: 1 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + 
npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + reward_loop_class_name: null + reward_loop_module_path: null + reward_loop_source: register + reward_manager: naive + rollout: + _target_: verl.workers.config.RolloutConfig + cudagraph_capture_sizes: null + data_parallel_size: 1 + disable_log_stats: true + dtype: bfloat16 + enable_chunked_prefill: true + enable_prefix_caching: true + enforce_eager: true + expert_parallel_size: 1 + free_cache_engine: true + gpu_memory_utilization: 0.5 + limit_images: null + load_format: auto + max_model_len: null + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + name: ??? 
+ prompt_length: 2048 + response_length: 2048 + skip_tokenizer_init: false + tensor_model_parallel_size: 2 + sandbox_fusion: + max_concurrent: 64 + memory_limit_mb: 1024 + url: null + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false + use_reward_loop: true +trainer: + value: + balance_batch: true + critic_warmup: 0 + default_hdfs_dir: null + default_local_dir: checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs + del_local_ckpt_after_load: false + device: cuda + esi_redundant_time: 0 + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + log_val_generations: 0 + logger: + - console + - wandb + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + n_gpus_per_node: 2 + nnodes: 1 + project_name: readctrl-verl + ray_wait_register_center_timeout: 300 + resume_from_path: null + resume_mode: auto + rollout_data_dir: null + save_freq: 20 + test_freq: 5 + total_epochs: 15 + total_training_steps: null + use_legacy_worker_impl: auto + val_before_train: true + val_only: false + validation_data_dir: null +transfer_queue: + value: + enable: false diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..25215981b2894427eeca8423d2d416d8ae0ac1e6 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/files/output.log @@ -0,0 +1,6 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/files/requirements.txt b/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c168a7097ef00c84a34fe1db69f95d6eaafb03f2 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/files/requirements.txt @@ -0,0 +1,269 @@ +verl==0.8.0.dev0 +psutil==7.1.3 +colorama==0.4.6 +annotated-doc==0.0.4 +sentry-sdk==2.51.0 +requests==2.32.5 +nvidia-cufile-cu12==1.13.1.3 +ml_dtypes==0.5.4 +xformers==0.0.32.post1 +sglang==0.5.2 +multidict==6.7.1 +typing_extensions==4.15.0 +nvidia-cusparselt-cu12==0.7.1 +openai-harmony==0.0.4 +transformers==4.56.1 +Werkzeug==3.1.5 +identify==2.6.16 +pytest==9.0.2 +nvidia-cuda-runtime-cu12==12.8.90 +GitPython==3.1.46 +cupy-cuda12x==13.6.0 +tokenizers==0.22.2 +pybind11==3.0.1 +google-api-core==2.29.0 +partial-json-parser==0.2.1.1.post7 +aiohttp-cors==0.8.1 +sniffio==1.3.1 +tensordict==0.10.0 +smart_open==7.5.0 +cffi==2.0.0 +wcwidth==0.5.3 +asttokens==3.0.1 +opencensus==0.11.4 +rpds-py==0.30.0 +py-spy==0.4.1 +gguf==0.17.1 +nvidia-nvjitlink-cu12==12.8.93 +httpx==0.28.1 +cuda-python==13.1.1 +annotated-types==0.7.0 +regex==2026.1.15 +vllm==0.11.0 +idna==3.11 +fsspec==2025.10.0 +parso==0.8.5 +pydantic-extra-types==2.11.0 +MarkupSafe==3.0.3 +cryptography==46.0.4 +filelock==3.20.3 +modelscope==1.34.0 +outlines==0.1.11 +dnspython==2.8.0 +scipy==1.17.0 +zipp==3.23.0 +PyYAML==6.0.3 
+onnx==1.20.1 +torchdata==0.11.0 +cuda-pathfinder==1.3.3 +verl==0.8.0.dev0 +httptools==0.7.1 +opencv-python-headless==4.13.0.90 +importlib_metadata==8.7.1 +peft==0.18.1 +opentelemetry-sdk==1.39.1 +python-json-logger==4.0.0 +cuda-bindings==13.1.1 +mdurl==0.1.2 +referencing==0.37.0 +xxhash==3.6.0 +interegular==0.3.3 +fastapi-cli==0.0.20 +uv==0.9.28 +tensorboard==2.20.0 +nvidia-cublas-cu12==12.8.4.1 +sentencepiece==0.2.1 +rich-toolkit==0.18.1 +numpy==2.2.0 +yarl==1.22.0 +opencv-fixer==0.2.5 +tqdm==4.67.2 +python-dotenv==1.2.1 +timm==1.0.16 +aiohappyeyeballs==2.6.1 +decord==0.6.0 +jiter==0.12.0 +airportsdata==20250909 +markdown-it-py==4.0.0 +nvidia-cusolver-cu12==11.7.3.90 +pyarrow==23.0.0 +opentelemetry-proto==1.39.1 +anyio==4.12.1 +pycryptodomex==3.23.0 +prometheus_client==0.24.1 +aiohttp==3.13.3 +urllib3==2.6.3 +pexpect==4.9.0 +pydantic-settings==2.12.0 +distro==1.9.0 +av==16.1.0 +cloudpickle==3.1.2 +mpmath==1.3.0 +certifi==2026.1.4 +antlr4-python3-runtime==4.9.3 +torchvision==0.23.0 +accelerate==1.12.0 +watchfiles==1.1.1 +ruff==0.14.14 +wheel==0.46.3 +omegaconf==2.3.0 +nvidia-cufft-cu12==11.3.3.83 +multiprocess==0.70.18 +frozendict==2.4.7 +sympy==1.14.0 +setproctitle==1.3.7 +setuptools==79.0.1 +py-cpuinfo==9.0.0 +ipython_pygments_lexers==1.1.1 +rich==14.3.2 +uvicorn==0.40.0 +openai==1.99.1 +outlines_core==0.2.11 +llvmlite==0.44.0 +nvidia-cuda-cupti-cu12==12.8.90 +attrs==25.4.0 +anthropic==0.77.0 +packaging==25.0 +fastrlock==0.8.3 +astor==0.8.1 +pluggy==1.6.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +psutil==7.2.2 +virtualenv==20.36.1 +cbor2==5.8.0 +compressed-tensors==0.11.0 +nvidia-cusparse-cu12==12.5.8.93 +networkx==3.6.1 +httpcore==1.0.9 +onnxscript==0.3.1 +smmap==5.0.2 +opencv-python==4.13.0.90 +traitlets==5.14.3 +python-multipart==0.0.22 +pyvers==0.1.0 +huggingface-hub==0.36.0 +pillow==12.1.0 +jsonschema==4.26.0 +cfgv==3.5.0 +optree==0.18.0 +email-validator==2.3.0 +tabulate==0.9.0 +pre_commit==4.5.1 +msgpack==1.1.2 +depyf==0.19.0 +numba==0.61.2 +six==1.17.0 
+aiosignal==1.4.0 +nvidia-nvtx-cu12==12.8.90 +propcache==0.4.1 +torch_memory_saver==0.0.8 +h11==0.16.0 +frozenlist==1.8.0 +websockets==16.0 +nvidia-cudnn-frontend==1.18.0 +build==1.4.0 +google-auth==2.48.0 +pycountry==24.6.1 +stack-data==0.6.3 +typing-inspection==0.4.2 +googleapis-common-protos==1.72.0 +pandas==3.0.0 +typer==0.21.1 +protobuf==6.33.5 +fastapi==0.128.0 +blake3==1.0.8 +opentelemetry-semantic-conventions==0.60b1 +opentelemetry-exporter-prometheus==0.60b1 +nvidia-cudnn-cu12==9.10.2.21 +Markdown==3.10.1 +liger_kernel==0.6.4 +nodeenv==1.10.0 +prompt_toolkit==3.0.52 +torchaudio==2.8.0 +codetiming==1.4.0 +platformdirs==4.5.1 +jsonschema-specifications==2025.9.1 +hydra-core==1.3.2 +tensorboard-data-server==0.7.2 +lm-format-enforcer==0.11.3 +pyasn1_modules==0.4.2 +tiktoken==0.12.0 +starlette==0.50.0 +pyproject_hooks==1.2.0 +flash_attn==2.8.1 +rsa==4.9.1 +ray==2.53.0 +nest-asyncio==1.6.0 +lark==1.2.2 +fastar==0.8.0 +orjson==3.11.6 +prometheus-fastapi-instrumentator==7.1.0 +opentelemetry-api==1.39.1 +mathruler==0.1.0 +pydantic_core==2.41.5 +fastapi-cloud-cli==0.11.0 +pynvml==13.0.1 +loguru==0.7.3 +torch==2.8.0 +msgspec==0.20.0 +nvidia-curand-cu12==10.3.9.90 +blobfile==3.0.0 +gitdb==4.0.12 +llguidance==0.7.30 +hf_transfer==0.1.9 +nvidia-nccl-cu12==2.27.3 +qwen-vl-utils==0.0.14 +ptyprocess==0.7.0 +ipdb==0.13.13 +opencensus-context==0.1.3 +jedi==0.19.2 +click==8.3.1 +datasets==4.5.0 +soxr==1.0.0 +sgl-kernel==0.3.9.post2 +colorful==0.5.8 +pyasn1==0.6.2 +charset-normalizer==3.4.4 +nvidia-ml-py==13.590.48 +hf-xet==1.2.0 +dill==0.4.0 +absl-py==2.4.0 +pydantic==2.12.5 +wrapt==2.1.0 +flashinfer-python==0.3.1 +python-dateutil==2.9.0.post0 +torchao==0.9.0 +cachetools==7.0.0 +soundfile==0.13.1 +diskcache==5.6.3 +onnx-ir==0.1.15 +docstring_parser==0.17.0 +matplotlib-inline==0.2.1 +Pygments==2.19.2 +wandb==0.24.1 +pure_eval==0.2.3 +ninja==1.13.0 +proto-plus==1.27.0 +pyzmq==27.1.0 +iniconfig==2.3.0 +Jinja2==3.1.6 +megatron-core==0.13.1 +uvloop==0.22.1 +pycparser==3.0 
+pylatexenc==2.10 +decorator==5.2.1 +shellingham==1.5.4 +lxml==6.0.2 +safetensors==0.7.0 +xgrammar==0.1.25 +pybase64==1.4.3 +ipython==9.9.0 +mistral_common==1.9.0 +rignore==0.7.6 +einops==0.8.2 +distlib==0.4.0 +triton==3.4.0 +executing==2.2.1 +grpcio==1.76.0 +pip==25.3 +verl==0.8.0.dev0 diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/files/wandb-metadata.json b/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..82221d82059372daa93a7c8ad64c9fb4dc6aff2f --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/files/wandb-metadata.json @@ -0,0 +1,93 @@ +{ + "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.12", + "startedAt": "2026-02-02T05:46:49.595754Z", + "args": [ + "--node-ip-address=172.16.34.29", + "--node-manager-port=33713", + "--object-store-name=/tmp/ray/session_2026-02-02_00-41-24_572915_2084427/sockets/plasma_store", + "--raylet-name=/tmp/ray/session_2026-02-02_00-41-24_572915_2084427/sockets/raylet", + "--redis-address=None", + "--metrics-agent-port=64056", + "--logging-rotate-bytes=536870912", + "--logging-rotate-backup-count=5", + "--runtime-env-agent-port=50957", + "--gcs-address=172.16.34.29:63507", + "--session-name=session_2026-02-02_00-41-24_572915_2084427", + "--temp-dir=/tmp/ray", + "--webui=127.0.0.1:8301", + "--cluster-id=fd1eb3fd7a8cd045ec4d4d1e0539cbe10c6ee6c109bbb084f4df4e1f", + "--startup-token=128", + "--worker-launch-time-ms=1770010897022", + "--node-id=8d3720ca1709ad73b4f96f5143f8f8aa65b540dffd6dd46d6434353e", + "--runtime-env-hash=1096984665" + ], + "program": "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py", + "git": { + "remote": "https://github.com/verl-project/verl", + "commit": "d9939add7a2a01923a9088891f913a5d20c4e622" + }, + "email": "shahidulshakib034@gmail.com", + "root": 
"/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train", + "host": "gamma", + "executable": "/home/mshahidul/miniconda3/envs/verl2/bin/python3", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA A100 80GB PCIe", + "gpu_count": 6, + "disk": { + "/": { + "total": "3766429188096", + "used": "182797938688" + } + }, + "memory": { + "total": "1081814863872" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece" + } + ], + "cudaVersion": "13.0", + "writerId": "xxgr8v6n1bswa9pz7my37vzchgijz3he" +} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/files/wandb-summary.json b/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..fd64cb4243828aef0863dd82d39b8388ba9522d3 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/files/wandb-summary.json @@ -0,0 +1 @@ 
+{"_wandb":{"runtime":507},"_runtime":507} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/logs/debug-core.log b/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..2bca7960489c121dc7549370641405e97d33de0d --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/logs/debug-core.log @@ -0,0 +1,11 @@ +{"time":"2026-02-02T00:46:49.74080983-05:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmppobhoa5g/port-2092907.txt","pid":2092907,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-02T00:46:49.741820594-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":2092907} +{"time":"2026-02-02T00:46:49.741826909-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-2092907-2102378-418657597/socket","Net":"unix"}} +{"time":"2026-02-02T00:46:49.906805527-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-02T00:46:49.920441812-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"iczy37hv","id":"1(@)"} +{"time":"2026-02-02T00:46:50.569415811-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"iczy37hv","id":"1(@)"} +{"time":"2026-02-02T00:46:56.839767196-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"70m8wzhjhp2x"} +{"time":"2026-02-02T00:55:18.798234669-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"70m8wzhjhp2x"} +{"time":"2026-02-02T00:55:19.449957148-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"70m8wzhjhp2x"} +{"time":"2026-02-02T00:55:19.455652429-05:00","level":"INFO","msg":"handleInformFinish: finish message 
received","streamId":"iczy37hv","id":"1(@)"} +{"time":"2026-02-02T00:55:19.458695541-05:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"iczy37hv","id":"1(@)"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/logs/debug-internal.log b/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..3797279e18c5b5ea094450067ba728380b6c0283 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2026-02-02T00:46:49.9299993-05:00","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-02T00:46:50.567138455-05:00","level":"INFO","msg":"stream: created new stream","id":"iczy37hv"} +{"time":"2026-02-02T00:46:50.567353207-05:00","level":"INFO","msg":"handler: started","stream_id":"iczy37hv"} +{"time":"2026-02-02T00:46:50.569385061-05:00","level":"INFO","msg":"stream: started","id":"iczy37hv"} +{"time":"2026-02-02T00:46:50.569616043-05:00","level":"INFO","msg":"writer: started","stream_id":"iczy37hv"} +{"time":"2026-02-02T00:46:50.569622758-05:00","level":"INFO","msg":"sender: started","stream_id":"iczy37hv"} +{"time":"2026-02-02T00:55:19.347801809-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-02T00:55:19.443372478-05:00","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2026-02-02T00:55:19.45568832-05:00","level":"INFO","msg":"stream: closing","id":"iczy37hv"} +{"time":"2026-02-02T00:55:19.45571281-05:00","level":"INFO","msg":"handler: closed","stream_id":"iczy37hv"} +{"time":"2026-02-02T00:55:19.457458354-05:00","level":"INFO","msg":"sender: closed","stream_id":"iczy37hv"} +{"time":"2026-02-02T00:55:19.457484375-05:00","level":"INFO","msg":"stream: closed","id":"iczy37hv"} diff --git 
a/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..f4bb0fec82639ef470f6b72163ec0566067052d0 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/logs/debug.log @@ -0,0 +1,24 @@ +2026-02-02 00:46:49,616 INFO MainThread:2092907 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-02 00:46:49,617 INFO MainThread:2092907 [wandb_setup.py:_flush():81] Configure stats pid to 2092907 +2026-02-02 00:46:49,617 INFO MainThread:2092907 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-02 00:46:49,617 INFO MainThread:2092907 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/logs/debug.log +2026-02-02 00:46:49,617 INFO MainThread:2092907 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/logs/debug-internal.log +2026-02-02 00:46:49,617 INFO MainThread:2092907 [wandb_init.py:init():844] calling init triggers +2026-02-02 00:46:49,619 INFO MainThread:2092907 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 
'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 32, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 
'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 
'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 512, 'response_length': 768, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.6, 'ignore_eos': False, 'enforce_eager': False, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': False, 'agent': {'_target_': 'verl.workers.config.AgentLoopConfig', 
'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': False}, 'model': {'_target_': 'verl.workers.config.HFModelConfig', 
'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet', 'train_max_samples': -1, 'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 512, 'max_response_length': 768, 'train_batch_size': 512, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': None}, 'datagen': {'path': None, 'name': None}, 
'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 
'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 'reward_loop_class_name': None, 
'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 'target_kl': 0.1}, 'use_pf_ppo': 
False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-optimized-multiclinsum-gs', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 20, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 5, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': 'checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs', 'max_actor_ckpt_to_keep': None, 'max_critic_ckpt_to_keep': None, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto'}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, '_wandb': {}} +2026-02-02 00:46:49,619 INFO MainThread:2092907 [wandb_init.py:init():892] starting backend +2026-02-02 00:46:49,907 INFO 
MainThread:2092907 [wandb_init.py:init():895] sending inform_init request +2026-02-02 00:46:49,915 INFO MainThread:2092907 [wandb_init.py:init():903] backend started and connected +2026-02-02 00:46:49,927 INFO MainThread:2092907 [wandb_init.py:init():973] updated telemetry +2026-02-02 00:46:49,955 INFO MainThread:2092907 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-02 00:46:50,892 INFO MainThread:2092907 [wandb_init.py:init():1042] starting run threads in backend +2026-02-02 00:46:51,811 INFO MainThread:2092907 [wandb_run.py:_console_start():2529] atexit reg +2026-02-02 00:46:51,811 INFO MainThread:2092907 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-02 00:46:51,812 INFO MainThread:2092907 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-02 00:46:51,812 INFO MainThread:2092907 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-02 00:46:51,824 INFO MainThread:2092907 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-02 00:55:18,793 INFO MainThread:2092907 [wandb_run.py:_finish():2295] finishing run shahidulshakib034-khulna-university-of-engineering-techn/readctrl-verl/iczy37hv +2026-02-02 00:55:18,795 INFO MainThread:2092907 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0 +2026-02-02 00:55:18,796 INFO MainThread:2092907 [wandb_run.py:_restore():2476] restore +2026-02-02 00:55:18,796 INFO MainThread:2092907 [wandb_run.py:_restore():2482] restore done +2026-02-02 00:55:19,449 INFO MainThread:2092907 [wandb_run.py:_footer_sync_info():3871] logging synced files diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/files/config.yaml b/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..49299d2b1a12bbf4fc1b4693dfd82b9d336b23ee --- /dev/null +++ 
b/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/files/config.yaml @@ -0,0 +1,813 @@ +_wandb: + value: + cli_version: 0.24.1 + e: + ho5m0em6att94itmozw5tkqsycypa49d: + args: + - --node-ip-address=172.16.34.29 + - --node-manager-port=37229 + - --object-store-name=/tmp/ray/session_2026-02-02_01-04-58_669972_2143225/sockets/plasma_store + - --raylet-name=/tmp/ray/session_2026-02-02_01-04-58_669972_2143225/sockets/raylet + - --redis-address=None + - --metrics-agent-port=62665 + - --logging-rotate-bytes=536870912 + - --logging-rotate-backup-count=5 + - --runtime-env-agent-port=63960 + - --gcs-address=172.16.34.29:52219 + - --session-name=session_2026-02-02_01-04-58_669972_2143225 + - --temp-dir=/tmp/ray + - --webui=127.0.0.1:8301 + - --cluster-id=e69e6a3849eff6a29cbc49c0543f6ff821633838841bf5c3843aefe9 + - --startup-token=128 + - --worker-launch-time-ms=1770012310847 + - --node-id=d20589a6087ffe9795ba55d56f88aea84cec6674bcfcf206ddcf4377 + - --runtime-env-hash=1096984665 + cpu_count: 64 + cpu_count_logical: 128 + cudaVersion: "13.0" + disk: + /: + total: "3766429188096" + used: "182827368448" + email: shahidulshakib034@gmail.com + executable: /home/mshahidul/miniconda3/envs/verl2/bin/python3 + git: + commit: d9939add7a2a01923a9088891f913a5d20c4e622 + remote: https://github.com/verl-project/verl + gpu: NVIDIA A100 80GB PCIe + gpu_count: 6 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-df506764-0db5-91b4-8ec9-154a3bb8123f + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328 + - 
architecture: Hopper + cudaCores: 14592 + memoryTotal: "85520809984" + name: NVIDIA H100 PCIe + uuid: GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb + - architecture: Hopper + cudaCores: 14592 + memoryTotal: "85520809984" + name: NVIDIA H100 PCIe + uuid: GPU-d42b6057-13e8-1e88-6aa1-9307df72dece + host: gamma + memory: + total: "1081814863872" + os: Linux-5.15.0-160-generic-x86_64-with-glibc2.35 + program: /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py + python: CPython 3.12.12 + root: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + startedAt: "2026-02-02T06:10:21.086218Z" + writerId: ho5m0em6att94itmozw5tkqsycypa49d + m: [] + python_version: 3.12.12 + t: + "1": + - 1 + - 11 + - 30 + - 41 + - 49 + - 50 + - 51 + - 71 + - 95 + - 98 + - 105 + "2": + - 1 + - 11 + - 30 + - 41 + - 49 + - 50 + - 51 + - 71 + - 95 + - 98 + - 105 + "3": + - 2 + - 13 + - 16 + "4": 3.12.12 + "5": 0.24.1 + "6": 4.56.1 + "12": 0.24.1 + "13": linux-x86_64 +actor_rollout_ref: + value: + actor: + _target_: verl.workers.config.FSDPActorConfig + calculate_entropy: false + calculate_sum_pi_squared: false + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + async_save: false + load_contents: + - model + - optimizer + - extra + save_contents: + - model + - optimizer + - extra + clip_ratio: 0.2 + clip_ratio_c: 3 + clip_ratio_high: 0.2 + clip_ratio_low: 0.2 + data_loader_seed: 42 + entropy_checkpointing: false + entropy_coeff: 0 + entropy_from_logits_with_chunking: false + freeze_vision_tower: false + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: false + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 
1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + grad_clip: 1 + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + loss_agg_mode: token-mean + loss_scale_factor: null + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + betas: + - 0.9 + - 0.999 + clip_grad: 1 + lr: 1e-06 + lr_scheduler_type: constant + lr_warmup_steps: -1 + lr_warmup_steps_ratio: 0 + min_lr_ratio: 0 + num_cycles: 0.5 + optimizer: AdamW + optimizer_impl: torch.optim + override_optimizer_config: null + total_training_steps: 90 + warmup_style: null + weight_decay: 0.01 + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + clip_cov_lb: 1 + clip_cov_ratio: 0.0002 + clip_cov_ub: 5 + kl_cov_ratio: 0.0002 + loss_mode: vanilla + ppo_kl_coef: 0.1 + ppo_epochs: 1 + ppo_max_token_len_per_gpu: 16384 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + ppo_mini_batch_size: 256 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + shuffle: false + strategy: fsdp + sum_pi_squared_checkpointing: false + tau_neg: 1.05 + tau_pos: 1 + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false + use_fused_kernels: false + use_kl_loss: true + use_prefix_grouper: false + use_remove_padding: true + use_torch_compile: true + hybrid_engine: true + model: + _target_: 
verl.workers.config.HFModelConfig + custom_chat_template: null + enable_activation_offload: false + enable_gradient_checkpointing: true + exclude_modules: null + external_lib: null + fused_kernel_options: + impl_backend: torch + hf_config_path: null + lora_adapter_path: null + lora_alpha: 16 + lora_rank: 0 + mtp: + _target_: verl.workers.config.MtpConfig + detach_encoder: false + enable: false + enable_rollout: false + enable_train: false + method: mtp + mtp_loss_scaling_factor: 0.1 + num_speculative_tokens: 1 + speculative_algorithm: EAGLE + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + speculative_num_steps: 3 + path: Qwen/Qwen3-4B-Instruct-2507 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + tokenizer_path: null + trust_remote_code: false + use_fused_kernels: false + use_liger: false + use_remove_padding: true + use_shm: false + nccl_timeout: 600 + ref: + _target_: verl.workers.config.FSDPActorConfig + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: true + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + log_prob_max_token_len_per_gpu: 16384 + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: 
+ _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_torch_compile: true + rollout: + _target_: verl.workers.config.RolloutConfig + agent: + _target_: verl.workers.config.AgentLoopConfig + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + name: null + path: null + default_agent_loop: single_turn_agent + num_workers: 8 + calculate_log_probs: false + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + cudagraph_capture_sizes: null + data_parallel_size: 1 + disable_log_stats: true + do_sample: true + dtype: bfloat16 + enable_chunked_prefill: true + enable_prefix_caching: true + enable_rollout_routing_replay: false + enforce_eager: false + expert_parallel_size: 1 + free_cache_engine: true + gpu_memory_utilization: 0.6 + ignore_eos: false + layered_summon: false + load_format: dummy + log_prob_max_token_len_per_gpu: 16384 + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: false + logprobs_mode: processed_logprobs + max_model_len: 8192 + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + mode: async + mtp: + _target_: verl.workers.config.MtpConfig + detach_encoder: false + enable: false + enable_rollout: false + enable_train: false + method: mtp + mtp_loss_scaling_factor: 0.1 + num_speculative_tokens: 1 + speculative_algorithm: EAGLE + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + speculative_num_steps: 3 + 
multi_stage_wake_up: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + format: hermes + interaction_config_path: null + max_assistant_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + max_user_turns: null + num_repeat_rollouts: null + tokenization_sanity_check_mode: strict + tool_config_path: null + tool_response_truncate_side: middle + use_inference_chat_template: false + "n": 3 + name: vllm + over_sample_rate: 0 + pipeline_model_parallel_size: 1 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + port: 9090 + served_model_name: Qwen/Qwen3-4B-Instruct-2507 + prompt_length: 512 + quantization: null + quantization_config_file: null + response_length: 768 + scheduling_policy: fcfs + skip_dump_dir: /tmp/rollout_dump + skip_rollout: false + skip_tokenizer_init: true + temperature: 1 + tensor_model_parallel_size: 1 + top_k: -1 + top_p: 1 + trace: + _target_: verl.workers.config.TraceConfig + backend: null + max_samples_per_step_per_worker: null + token2text: false + val_kwargs: + _target_: verl.workers.config.SamplingConfig + do_sample: false + "n": 1 + temperature: 0 + top_k: -1 + top_p: 1 +algorithm: + value: + _target_: verl.trainer.config.AlgoConfig + adv_estimator: grpo + gamma: 1 + kl_ctrl: + _target_: 
verl.trainer.config.KLControlConfig + horizon: 10000 + kl_coef: 0.001 + target_kl: 0.1 + type: fixed + kl_penalty: kl + lam: 1 + norm_adv_by_std_in_grpo: true + pf_ppo: + reweight_method: pow + weight_pow: 2 + rollout_correction: + bypass_mode: false + loss_type: ppo_clip + rollout_is: null + rollout_is_batch_normalize: false + rollout_is_threshold: 2 + rollout_rs: null + rollout_rs_threshold: null + use_kl_in_reward: false + use_pf_ppo: false +critic: + value: + _target_: verl.workers.config.FSDPCriticConfig + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + async_save: false + load_contents: + - model + - optimizer + - extra + save_contents: + - model + - optimizer + - extra + cliprange_value: 0.5 + data_loader_seed: 42 + enable: null + forward_max_token_len_per_gpu: 32768 + forward_micro_batch_size: null + forward_micro_batch_size_per_gpu: null + grad_clip: 1 + loss_agg_mode: token-mean + model: + _target_: verl.workers.config.FSDPCriticModelCfg + enable_activation_offload: false + enable_gradient_checkpointing: true + external_lib: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: false + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + lora_alpha: 16 + lora_rank: 0 + path: ~/models/deepseek-llm-7b-chat + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + tokenizer_path: Qwen/Qwen3-4B-Instruct-2507 + trust_remote_code: false + use_remove_padding: false + use_shm: false + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + betas: + - 0.9 + - 0.999 + clip_grad: 1 + lr: 1e-05 + lr_scheduler_type: constant 
+ lr_warmup_steps: -1 + lr_warmup_steps_ratio: 0 + min_lr_ratio: 0 + num_cycles: 0.5 + optimizer: AdamW + optimizer_impl: torch.optim + override_optimizer_config: null + total_training_steps: 90 + warmup_style: null + weight_decay: 0.01 + ppo_epochs: 1 + ppo_max_token_len_per_gpu: 32768 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: null + ppo_mini_batch_size: 256 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + shuffle: false + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false +custom_reward_function: + value: + name: compute_score + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +data: + value: + custom_cls: + name: null + path: null + datagen: + name: null + path: null + dataloader_num_workers: 8 + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + image_key: images + image_patch_size: 14 + max_prompt_length: 512 + max_response_length: 768 + prompt_key: prompt + return_full_prompt: false + return_multi_modal_inputs: true + return_raw_chat: true + return_raw_input_ids: false + reward_fn_key: data_source + sampler: + class_name: null + class_path: null + seed: null + shuffle: true + tokenizer: null + tool_config_path: null + train_batch_size: 512 + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + train_max_samples: -1 + truncation: error + trust_remote_code: false + 
use_shm: false + val_batch_size: null + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + val_max_samples: -1 + validation_shuffle: false + video_key: videos +global_profiler: + value: + _target_: verl.utils.profiler.ProfilerConfig + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + controller_nsight_options: + cuda-graph-trace: graph + cuda-memory-usage: "true" + trace: cuda,nvtx,cublas,ucx + discrete: false + worker_nsight_options: + capture-range: cudaProfilerApi + capture-range-end: null + cuda-graph-trace: graph + cuda-memory-usage: "true" + kill: none + trace: cuda,nvtx,cublas,ucx + torch_memory: + context: all + stack_depth: 32 + stacks: all + trace_alloc_max_entries: 100000 + profile_continuous_steps: false + save_path: outputs/profile + steps: null + tool: null +ray_kwargs: + value: + ray_init: + num_cpus: null + timeline_json_file: null +reward_manager: + value: + _target_: verl.trainer.config.config.RewardManagerConfig + module: + _target_: verl.trainer.config.config.ModuleConfig + name: custom_reward_manager + path: null + name: naive + source: register +reward_model: + value: + enable: false + enable_resource_pool: false + forward_max_token_len_per_gpu: 32768 + launch_reward_fn_async: false + max_length: null + micro_batch_size: null + micro_batch_size_per_gpu: null + model: + external_lib: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + forward_prefetch: false + fsdp_size: -1 + param_offload: false + reshard_after_forward: true + wrap_policy: + min_num_params: 0 + input_tokenizer: Qwen/Qwen3-4B-Instruct-2507 + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + trust_remote_code: false + use_fused_kernels: false + use_remove_padding: false + use_shm: false + n_gpus_per_node: 8 + nnodes: 0 + num_workers: 1 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + 
npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + reward_loop_class_name: null + reward_loop_module_path: null + reward_loop_source: register + reward_manager: naive + rollout: + _target_: verl.workers.config.RolloutConfig + cudagraph_capture_sizes: null + data_parallel_size: 1 + disable_log_stats: true + dtype: bfloat16 + enable_chunked_prefill: true + enable_prefix_caching: true + enforce_eager: true + expert_parallel_size: 1 + free_cache_engine: true + gpu_memory_utilization: 0.5 + limit_images: null + load_format: auto + max_model_len: null + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + name: ??? 
+ prompt_length: 2048 + response_length: 2048 + skip_tokenizer_init: false + tensor_model_parallel_size: 2 + sandbox_fusion: + max_concurrent: 64 + memory_limit_mb: 1024 + url: null + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false + use_reward_loop: true +trainer: + value: + balance_batch: true + critic_warmup: 0 + default_hdfs_dir: null + default_local_dir: checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs + del_local_ckpt_after_load: false + device: cuda + esi_redundant_time: 0 + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + log_val_generations: 0 + logger: + - console + - wandb + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + n_gpus_per_node: 2 + nnodes: 1 + project_name: readctrl-verl + ray_wait_register_center_timeout: 300 + resume_from_path: null + resume_mode: auto + rollout_data_dir: null + save_freq: 20 + test_freq: 5 + total_epochs: 15 + total_training_steps: null + use_legacy_worker_impl: auto + val_before_train: true + val_only: false + validation_data_dir: null +transfer_queue: + value: + enable: false diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..25215981b2894427eeca8423d2d416d8ae0ac1e6 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/files/output.log @@ -0,0 +1,6 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/files/requirements.txt b/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c168a7097ef00c84a34fe1db69f95d6eaafb03f2 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/files/requirements.txt @@ -0,0 +1,269 @@ +verl==0.8.0.dev0 +psutil==7.1.3 +colorama==0.4.6 +annotated-doc==0.0.4 +sentry-sdk==2.51.0 +requests==2.32.5 +nvidia-cufile-cu12==1.13.1.3 +ml_dtypes==0.5.4 +xformers==0.0.32.post1 +sglang==0.5.2 +multidict==6.7.1 +typing_extensions==4.15.0 +nvidia-cusparselt-cu12==0.7.1 +openai-harmony==0.0.4 +transformers==4.56.1 +Werkzeug==3.1.5 +identify==2.6.16 +pytest==9.0.2 +nvidia-cuda-runtime-cu12==12.8.90 +GitPython==3.1.46 +cupy-cuda12x==13.6.0 +tokenizers==0.22.2 +pybind11==3.0.1 +google-api-core==2.29.0 +partial-json-parser==0.2.1.1.post7 +aiohttp-cors==0.8.1 +sniffio==1.3.1 +tensordict==0.10.0 +smart_open==7.5.0 +cffi==2.0.0 +wcwidth==0.5.3 +asttokens==3.0.1 +opencensus==0.11.4 +rpds-py==0.30.0 +py-spy==0.4.1 +gguf==0.17.1 +nvidia-nvjitlink-cu12==12.8.93 +httpx==0.28.1 +cuda-python==13.1.1 +annotated-types==0.7.0 +regex==2026.1.15 +vllm==0.11.0 +idna==3.11 +fsspec==2025.10.0 +parso==0.8.5 +pydantic-extra-types==2.11.0 +MarkupSafe==3.0.3 +cryptography==46.0.4 +filelock==3.20.3 +modelscope==1.34.0 +outlines==0.1.11 +dnspython==2.8.0 +scipy==1.17.0 +zipp==3.23.0 +PyYAML==6.0.3 
+onnx==1.20.1 +torchdata==0.11.0 +cuda-pathfinder==1.3.3 +verl==0.8.0.dev0 +httptools==0.7.1 +opencv-python-headless==4.13.0.90 +importlib_metadata==8.7.1 +peft==0.18.1 +opentelemetry-sdk==1.39.1 +python-json-logger==4.0.0 +cuda-bindings==13.1.1 +mdurl==0.1.2 +referencing==0.37.0 +xxhash==3.6.0 +interegular==0.3.3 +fastapi-cli==0.0.20 +uv==0.9.28 +tensorboard==2.20.0 +nvidia-cublas-cu12==12.8.4.1 +sentencepiece==0.2.1 +rich-toolkit==0.18.1 +numpy==2.2.0 +yarl==1.22.0 +opencv-fixer==0.2.5 +tqdm==4.67.2 +python-dotenv==1.2.1 +timm==1.0.16 +aiohappyeyeballs==2.6.1 +decord==0.6.0 +jiter==0.12.0 +airportsdata==20250909 +markdown-it-py==4.0.0 +nvidia-cusolver-cu12==11.7.3.90 +pyarrow==23.0.0 +opentelemetry-proto==1.39.1 +anyio==4.12.1 +pycryptodomex==3.23.0 +prometheus_client==0.24.1 +aiohttp==3.13.3 +urllib3==2.6.3 +pexpect==4.9.0 +pydantic-settings==2.12.0 +distro==1.9.0 +av==16.1.0 +cloudpickle==3.1.2 +mpmath==1.3.0 +certifi==2026.1.4 +antlr4-python3-runtime==4.9.3 +torchvision==0.23.0 +accelerate==1.12.0 +watchfiles==1.1.1 +ruff==0.14.14 +wheel==0.46.3 +omegaconf==2.3.0 +nvidia-cufft-cu12==11.3.3.83 +multiprocess==0.70.18 +frozendict==2.4.7 +sympy==1.14.0 +setproctitle==1.3.7 +setuptools==79.0.1 +py-cpuinfo==9.0.0 +ipython_pygments_lexers==1.1.1 +rich==14.3.2 +uvicorn==0.40.0 +openai==1.99.1 +outlines_core==0.2.11 +llvmlite==0.44.0 +nvidia-cuda-cupti-cu12==12.8.90 +attrs==25.4.0 +anthropic==0.77.0 +packaging==25.0 +fastrlock==0.8.3 +astor==0.8.1 +pluggy==1.6.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +psutil==7.2.2 +virtualenv==20.36.1 +cbor2==5.8.0 +compressed-tensors==0.11.0 +nvidia-cusparse-cu12==12.5.8.93 +networkx==3.6.1 +httpcore==1.0.9 +onnxscript==0.3.1 +smmap==5.0.2 +opencv-python==4.13.0.90 +traitlets==5.14.3 +python-multipart==0.0.22 +pyvers==0.1.0 +huggingface-hub==0.36.0 +pillow==12.1.0 +jsonschema==4.26.0 +cfgv==3.5.0 +optree==0.18.0 +email-validator==2.3.0 +tabulate==0.9.0 +pre_commit==4.5.1 +msgpack==1.1.2 +depyf==0.19.0 +numba==0.61.2 +six==1.17.0 
+aiosignal==1.4.0 +nvidia-nvtx-cu12==12.8.90 +propcache==0.4.1 +torch_memory_saver==0.0.8 +h11==0.16.0 +frozenlist==1.8.0 +websockets==16.0 +nvidia-cudnn-frontend==1.18.0 +build==1.4.0 +google-auth==2.48.0 +pycountry==24.6.1 +stack-data==0.6.3 +typing-inspection==0.4.2 +googleapis-common-protos==1.72.0 +pandas==3.0.0 +typer==0.21.1 +protobuf==6.33.5 +fastapi==0.128.0 +blake3==1.0.8 +opentelemetry-semantic-conventions==0.60b1 +opentelemetry-exporter-prometheus==0.60b1 +nvidia-cudnn-cu12==9.10.2.21 +Markdown==3.10.1 +liger_kernel==0.6.4 +nodeenv==1.10.0 +prompt_toolkit==3.0.52 +torchaudio==2.8.0 +codetiming==1.4.0 +platformdirs==4.5.1 +jsonschema-specifications==2025.9.1 +hydra-core==1.3.2 +tensorboard-data-server==0.7.2 +lm-format-enforcer==0.11.3 +pyasn1_modules==0.4.2 +tiktoken==0.12.0 +starlette==0.50.0 +pyproject_hooks==1.2.0 +flash_attn==2.8.1 +rsa==4.9.1 +ray==2.53.0 +nest-asyncio==1.6.0 +lark==1.2.2 +fastar==0.8.0 +orjson==3.11.6 +prometheus-fastapi-instrumentator==7.1.0 +opentelemetry-api==1.39.1 +mathruler==0.1.0 +pydantic_core==2.41.5 +fastapi-cloud-cli==0.11.0 +pynvml==13.0.1 +loguru==0.7.3 +torch==2.8.0 +msgspec==0.20.0 +nvidia-curand-cu12==10.3.9.90 +blobfile==3.0.0 +gitdb==4.0.12 +llguidance==0.7.30 +hf_transfer==0.1.9 +nvidia-nccl-cu12==2.27.3 +qwen-vl-utils==0.0.14 +ptyprocess==0.7.0 +ipdb==0.13.13 +opencensus-context==0.1.3 +jedi==0.19.2 +click==8.3.1 +datasets==4.5.0 +soxr==1.0.0 +sgl-kernel==0.3.9.post2 +colorful==0.5.8 +pyasn1==0.6.2 +charset-normalizer==3.4.4 +nvidia-ml-py==13.590.48 +hf-xet==1.2.0 +dill==0.4.0 +absl-py==2.4.0 +pydantic==2.12.5 +wrapt==2.1.0 +flashinfer-python==0.3.1 +python-dateutil==2.9.0.post0 +torchao==0.9.0 +cachetools==7.0.0 +soundfile==0.13.1 +diskcache==5.6.3 +onnx-ir==0.1.15 +docstring_parser==0.17.0 +matplotlib-inline==0.2.1 +Pygments==2.19.2 +wandb==0.24.1 +pure_eval==0.2.3 +ninja==1.13.0 +proto-plus==1.27.0 +pyzmq==27.1.0 +iniconfig==2.3.0 +Jinja2==3.1.6 +megatron-core==0.13.1 +uvloop==0.22.1 +pycparser==3.0 
+pylatexenc==2.10 +decorator==5.2.1 +shellingham==1.5.4 +lxml==6.0.2 +safetensors==0.7.0 +xgrammar==0.1.25 +pybase64==1.4.3 +ipython==9.9.0 +mistral_common==1.9.0 +rignore==0.7.6 +einops==0.8.2 +distlib==0.4.0 +triton==3.4.0 +executing==2.2.1 +grpcio==1.76.0 +pip==25.3 +verl==0.8.0.dev0 diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/files/wandb-metadata.json b/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..7bb5ac985e8e1b730be5dd7445e6f9133f68c34f --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/files/wandb-metadata.json @@ -0,0 +1,93 @@ +{ + "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.12", + "startedAt": "2026-02-02T06:10:21.086218Z", + "args": [ + "--node-ip-address=172.16.34.29", + "--node-manager-port=37229", + "--object-store-name=/tmp/ray/session_2026-02-02_01-04-58_669972_2143225/sockets/plasma_store", + "--raylet-name=/tmp/ray/session_2026-02-02_01-04-58_669972_2143225/sockets/raylet", + "--redis-address=None", + "--metrics-agent-port=62665", + "--logging-rotate-bytes=536870912", + "--logging-rotate-backup-count=5", + "--runtime-env-agent-port=63960", + "--gcs-address=172.16.34.29:52219", + "--session-name=session_2026-02-02_01-04-58_669972_2143225", + "--temp-dir=/tmp/ray", + "--webui=127.0.0.1:8301", + "--cluster-id=e69e6a3849eff6a29cbc49c0543f6ff821633838841bf5c3843aefe9", + "--startup-token=128", + "--worker-launch-time-ms=1770012310847", + "--node-id=d20589a6087ffe9795ba55d56f88aea84cec6674bcfcf206ddcf4377", + "--runtime-env-hash=1096984665" + ], + "program": "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py", + "git": { + "remote": "https://github.com/verl-project/verl", + "commit": "d9939add7a2a01923a9088891f913a5d20c4e622" + }, + "email": "shahidulshakib034@gmail.com", + "root": 
"/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train", + "host": "gamma", + "executable": "/home/mshahidul/miniconda3/envs/verl2/bin/python3", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA A100 80GB PCIe", + "gpu_count": 6, + "disk": { + "/": { + "total": "3766429188096", + "used": "182827368448" + } + }, + "memory": { + "total": "1081814863872" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece" + } + ], + "cudaVersion": "13.0", + "writerId": "ho5m0em6att94itmozw5tkqsycypa49d" +} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/files/wandb-summary.json b/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..720fe9ea3dd29e0548ff80ce6bee1ba208392cbb --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/files/wandb-summary.json @@ -0,0 +1 @@ 
+{"_wandb":{"runtime":1546},"_runtime":1546} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/logs/debug-core.log b/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..3c9fdff4797232b83d5bbab616e23f50919b0e04 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/logs/debug-core.log @@ -0,0 +1,12 @@ +{"time":"2026-02-02T01:10:21.242656358-05:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpt2r6c1br/port-2151704.txt","pid":2151704,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-02T01:10:21.24690502-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":2151704} +{"time":"2026-02-02T01:10:21.246810272-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-2151704-2161125-992638249/socket","Net":"unix"}} +{"time":"2026-02-02T01:10:21.398943056-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-02T01:10:21.41507198-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"xbya534l","id":"1(@)"} +{"time":"2026-02-02T01:10:22.947574054-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"xbya534l","id":"1(@)"} +{"time":"2026-02-02T01:10:29.386042676-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"tw6orgwb7w5g"} +{"time":"2026-02-02T01:36:09.582975996-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"tw6orgwb7w5g"} +{"time":"2026-02-02T01:36:10.237942254-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"tw6orgwb7w5g"} +{"time":"2026-02-02T01:36:10.241336805-05:00","level":"INFO","msg":"handleInformFinish: finish message 
received","streamId":"xbya534l","id":"1(@)"} +{"time":"2026-02-02T01:36:10.245221437-05:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"xbya534l","id":"1(@)"} +{"time":"2026-02-02T01:36:11.781642226-05:00","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/logs/debug-internal.log b/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..8e38aac3268fb980dc0d172f91dcd8540e6262ec --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2026-02-02T01:10:21.416480605-05:00","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-02T01:10:22.94482613-05:00","level":"INFO","msg":"stream: created new stream","id":"xbya534l"} +{"time":"2026-02-02T01:10:22.945091763-05:00","level":"INFO","msg":"handler: started","stream_id":"xbya534l"} +{"time":"2026-02-02T01:10:22.947543232-05:00","level":"INFO","msg":"stream: started","id":"xbya534l"} +{"time":"2026-02-02T01:10:22.947586094-05:00","level":"INFO","msg":"writer: started","stream_id":"xbya534l"} +{"time":"2026-02-02T01:10:22.94759751-05:00","level":"INFO","msg":"sender: started","stream_id":"xbya534l"} +{"time":"2026-02-02T01:36:10.087002146-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-02T01:36:10.231755289-05:00","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2026-02-02T01:36:10.241382195-05:00","level":"INFO","msg":"stream: closing","id":"xbya534l"} +{"time":"2026-02-02T01:36:10.241407656-05:00","level":"INFO","msg":"handler: closed","stream_id":"xbya534l"} +{"time":"2026-02-02T01:36:10.243821834-05:00","level":"INFO","msg":"sender: closed","stream_id":"xbya534l"} 
+{"time":"2026-02-02T01:36:10.24388357-05:00","level":"INFO","msg":"stream: closed","id":"xbya534l"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..5a92ab79b143fa414e8a11d6c0e1e960732ea8b4 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/logs/debug.log @@ -0,0 +1,24 @@ +2026-02-02 01:10:21,106 INFO MainThread:2151704 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-02 01:10:21,106 INFO MainThread:2151704 [wandb_setup.py:_flush():81] Configure stats pid to 2151704 +2026-02-02 01:10:21,107 INFO MainThread:2151704 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-02 01:10:21,107 INFO MainThread:2151704 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/logs/debug.log +2026-02-02 01:10:21,107 INFO MainThread:2151704 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/logs/debug-internal.log +2026-02-02 01:10:21,107 INFO MainThread:2151704 [wandb_init.py:init():844] calling init triggers +2026-02-02 01:10:21,109 INFO MainThread:2151704 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'fsdp_config': 
{'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 32, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 
'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 
'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 512, 'response_length': 768, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.6, 'ignore_eos': False, 'enforce_eager': False, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': 
False, 'agent': {'_target_': 'verl.workers.config.AgentLoopConfig', 'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': 
False}, 'model': {'_target_': 'verl.workers.config.HFModelConfig', 'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet', 'train_max_samples': -1, 'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 512, 'max_response_length': 768, 'train_batch_size': 512, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': 
None}, 'datagen': {'path': None, 'name': None}, 'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 
'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 
'reward_loop_class_name': None, 'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 
'target_kl': 0.1}, 'use_pf_ppo': False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-optimized-multiclinsum-gs', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 20, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 5, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': 'checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs', 'max_actor_ckpt_to_keep': None, 'max_critic_ckpt_to_keep': None, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto'}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, '_wandb': {}} +2026-02-02 01:10:21,109 INFO MainThread:2151704 [wandb_init.py:init():892] starting backend 
+2026-02-02 01:10:21,399 INFO MainThread:2151704 [wandb_init.py:init():895] sending inform_init request +2026-02-02 01:10:21,409 INFO MainThread:2151704 [wandb_init.py:init():903] backend started and connected +2026-02-02 01:10:21,424 INFO MainThread:2151704 [wandb_init.py:init():973] updated telemetry +2026-02-02 01:10:21,460 INFO MainThread:2151704 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-02 01:10:23,263 INFO MainThread:2151704 [wandb_init.py:init():1042] starting run threads in backend +2026-02-02 01:10:24,360 INFO MainThread:2151704 [wandb_run.py:_console_start():2529] atexit reg +2026-02-02 01:10:24,361 INFO MainThread:2151704 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-02 01:10:24,361 INFO MainThread:2151704 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-02 01:10:24,361 INFO MainThread:2151704 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-02 01:10:24,374 INFO MainThread:2151704 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-02 01:36:09,579 INFO MainThread:2151704 [wandb_run.py:_finish():2295] finishing run shahidulshakib034-khulna-university-of-engineering-techn/readctrl-verl/xbya534l +2026-02-02 01:36:09,580 INFO MainThread:2151704 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0 +2026-02-02 01:36:09,581 INFO MainThread:2151704 [wandb_run.py:_restore():2476] restore +2026-02-02 01:36:09,581 INFO MainThread:2151704 [wandb_run.py:_restore():2482] restore done +2026-02-02 01:36:10,237 INFO MainThread:2151704 [wandb_run.py:_footer_sync_info():3871] logging synced files diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/files/config.yaml b/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7d3a0e73d68ea1a3db3081e3def4dab6f9522c9d --- /dev/null +++ 
b/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/files/config.yaml @@ -0,0 +1,815 @@ +_wandb: + value: + cli_version: 0.24.1 + e: + kfar92kzq2tt3vn3peb4zpbz23c0q97e: + args: + - --node-ip-address=172.16.34.29 + - --node-manager-port=44173 + - --object-store-name=/tmp/ray/session_2026-02-02_09-46-37_229579_2792949/sockets/plasma_store + - --raylet-name=/tmp/ray/session_2026-02-02_09-46-37_229579_2792949/sockets/raylet + - --redis-address=None + - --metrics-agent-port=59209 + - --logging-rotate-bytes=536870912 + - --logging-rotate-backup-count=5 + - --runtime-env-agent-port=65414 + - --gcs-address=172.16.34.29:63816 + - --session-name=session_2026-02-02_09-46-37_229579_2792949 + - --temp-dir=/tmp/ray + - --webui=127.0.0.1:8301 + - --cluster-id=39a2162a1a775a0c652ad9a25f459ddc87ae0e16c7e2ce61a2db083d + - --startup-token=128 + - --worker-launch-time-ms=1770043610629 + - --node-id=b8834a3d730307500a971f42611c49920f7a913a7105abfd56fbc999 + - --runtime-env-hash=1096984665 + cpu_count: 64 + cpu_count_logical: 128 + cudaVersion: "13.0" + disk: + /: + total: "3766429188096" + used: "182962978816" + email: shahidulshakib034@gmail.com + executable: /home/mshahidul/miniconda3/envs/verl2/bin/python3 + git: + commit: d9939add7a2a01923a9088891f913a5d20c4e622 + remote: https://github.com/verl-project/verl + gpu: NVIDIA A100 80GB PCIe + gpu_count: 6 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-df506764-0db5-91b4-8ec9-154a3bb8123f + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328 + - 
architecture: Hopper + cudaCores: 14592 + memoryTotal: "85520809984" + name: NVIDIA H100 PCIe + uuid: GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb + - architecture: Hopper + cudaCores: 14592 + memoryTotal: "85520809984" + name: NVIDIA H100 PCIe + uuid: GPU-d42b6057-13e8-1e88-6aa1-9307df72dece + host: gamma + memory: + total: "1081814863872" + os: Linux-5.15.0-160-generic-x86_64-with-glibc2.35 + program: /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py + python: CPython 3.12.12 + root: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + startedAt: "2026-02-02T14:52:27.364301Z" + writerId: kfar92kzq2tt3vn3peb4zpbz23c0q97e + m: [] + python_version: 3.12.12 + t: + "1": + - 1 + - 11 + - 30 + - 41 + - 49 + - 50 + - 51 + - 71 + - 95 + - 98 + - 105 + "2": + - 1 + - 11 + - 30 + - 41 + - 49 + - 50 + - 51 + - 71 + - 95 + - 98 + - 105 + "3": + - 2 + - 13 + - 16 + - 61 + "4": 3.12.12 + "5": 0.24.1 + "6": 4.56.1 + "12": 0.24.1 + "13": linux-x86_64 +actor_rollout_ref: + value: + actor: + _target_: verl.workers.config.FSDPActorConfig + calculate_entropy: false + calculate_sum_pi_squared: false + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + async_save: false + load_contents: + - model + - optimizer + - extra + save_contents: + - model + - optimizer + - extra + clip_ratio: 0.2 + clip_ratio_c: 3 + clip_ratio_high: 0.2 + clip_ratio_low: 0.2 + data_loader_seed: 42 + entropy_checkpointing: false + entropy_coeff: 0 + entropy_from_logits_with_chunking: false + freeze_vision_tower: false + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: false + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + 
ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + grad_clip: 1 + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + loss_agg_mode: token-mean + loss_scale_factor: null + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + betas: + - 0.9 + - 0.999 + clip_grad: 1 + lr: 1e-06 + lr_scheduler_type: constant + lr_warmup_steps: -1 + lr_warmup_steps_ratio: 0 + min_lr_ratio: 0 + num_cycles: 0.5 + optimizer: AdamW + optimizer_impl: torch.optim + override_optimizer_config: null + total_training_steps: 90 + warmup_style: null + weight_decay: 0.01 + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + clip_cov_lb: 1 + clip_cov_ratio: 0.0002 + clip_cov_ub: 5 + kl_cov_ratio: 0.0002 + loss_mode: vanilla + ppo_kl_coef: 0.1 + ppo_epochs: 1 + ppo_max_token_len_per_gpu: 16384 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + ppo_mini_batch_size: 256 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + shuffle: false + strategy: fsdp + sum_pi_squared_checkpointing: false + tau_neg: 1.05 + tau_pos: 1 + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false + use_fused_kernels: false + use_kl_loss: true + use_prefix_grouper: false + use_remove_padding: true + use_torch_compile: true + hybrid_engine: 
true + model: + _target_: verl.workers.config.HFModelConfig + custom_chat_template: null + enable_activation_offload: false + enable_gradient_checkpointing: true + exclude_modules: null + external_lib: null + fused_kernel_options: + impl_backend: torch + hf_config_path: null + lora_adapter_path: null + lora_alpha: 16 + lora_rank: 0 + mtp: + _target_: verl.workers.config.MtpConfig + detach_encoder: false + enable: false + enable_rollout: false + enable_train: false + method: mtp + mtp_loss_scaling_factor: 0.1 + num_speculative_tokens: 1 + speculative_algorithm: EAGLE + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + speculative_num_steps: 3 + path: Qwen/Qwen3-4B-Instruct-2507 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + tokenizer_path: null + trust_remote_code: false + use_fused_kernels: false + use_liger: false + use_remove_padding: true + use_shm: false + nccl_timeout: 600 + ref: + _target_: verl.workers.config.FSDPActorConfig + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: true + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + log_prob_max_token_len_per_gpu: 16384 + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: 
false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_torch_compile: true + rollout: + _target_: verl.workers.config.RolloutConfig + agent: + _target_: verl.workers.config.AgentLoopConfig + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + name: null + path: null + default_agent_loop: single_turn_agent + num_workers: 8 + calculate_log_probs: false + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + cudagraph_capture_sizes: null + data_parallel_size: 1 + disable_log_stats: true + do_sample: true + dtype: bfloat16 + enable_chunked_prefill: true + enable_prefix_caching: true + enable_rollout_routing_replay: false + enforce_eager: false + expert_parallel_size: 1 + free_cache_engine: true + gpu_memory_utilization: 0.6 + ignore_eos: false + layered_summon: false + load_format: dummy + log_prob_max_token_len_per_gpu: 16384 + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: false + logprobs_mode: processed_logprobs + max_model_len: 8192 + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + mode: async + mtp: + _target_: verl.workers.config.MtpConfig + detach_encoder: false + enable: false + enable_rollout: false + enable_train: false + method: mtp + mtp_loss_scaling_factor: 0.1 + num_speculative_tokens: 1 + speculative_algorithm: EAGLE + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + 
speculative_num_steps: 3 + multi_stage_wake_up: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + format: hermes + interaction_config_path: null + max_assistant_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + max_user_turns: null + num_repeat_rollouts: null + tokenization_sanity_check_mode: strict + tool_config_path: null + tool_response_truncate_side: middle + use_inference_chat_template: false + "n": 3 + name: vllm + over_sample_rate: 0 + pipeline_model_parallel_size: 1 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + port: 9090 + served_model_name: Qwen/Qwen3-4B-Instruct-2507 + prompt_length: 1024 + quantization: null + quantization_config_file: null + response_length: 2048 + scheduling_policy: fcfs + skip_dump_dir: /tmp/rollout_dump + skip_rollout: false + skip_tokenizer_init: true + temperature: 1 + tensor_model_parallel_size: 1 + top_k: -1 + top_p: 1 + trace: + _target_: verl.workers.config.TraceConfig + backend: null + max_samples_per_step_per_worker: null + token2text: false + val_kwargs: + _target_: verl.workers.config.SamplingConfig + do_sample: false + "n": 1 + temperature: 0 + top_k: -1 + top_p: 1 +algorithm: + value: + _target_: verl.trainer.config.AlgoConfig + adv_estimator: grpo + gamma: 1 + kl_ctrl: + 
_target_: verl.trainer.config.KLControlConfig + horizon: 10000 + kl_coef: 0.001 + target_kl: 0.1 + type: fixed + kl_penalty: kl + lam: 1 + norm_adv_by_std_in_grpo: true + pf_ppo: + reweight_method: pow + weight_pow: 2 + rollout_correction: + bypass_mode: false + loss_type: ppo_clip + rollout_is: null + rollout_is_batch_normalize: false + rollout_is_threshold: 2 + rollout_rs: null + rollout_rs_threshold: null + use_kl_in_reward: false + use_pf_ppo: false +critic: + value: + _target_: verl.workers.config.FSDPCriticConfig + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + async_save: false + load_contents: + - model + - optimizer + - extra + save_contents: + - model + - optimizer + - extra + cliprange_value: 0.5 + data_loader_seed: 42 + enable: null + forward_max_token_len_per_gpu: 32768 + forward_micro_batch_size: null + forward_micro_batch_size_per_gpu: null + grad_clip: 1 + loss_agg_mode: token-mean + model: + _target_: verl.workers.config.FSDPCriticModelCfg + enable_activation_offload: false + enable_gradient_checkpointing: true + external_lib: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: false + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + lora_alpha: 16 + lora_rank: 0 + path: ~/models/deepseek-llm-7b-chat + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + tokenizer_path: Qwen/Qwen3-4B-Instruct-2507 + trust_remote_code: false + use_remove_padding: false + use_shm: false + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + betas: + - 0.9 + - 0.999 + clip_grad: 1 + lr: 1e-05 + lr_scheduler_type: 
constant + lr_warmup_steps: -1 + lr_warmup_steps_ratio: 0 + min_lr_ratio: 0 + num_cycles: 0.5 + optimizer: AdamW + optimizer_impl: torch.optim + override_optimizer_config: null + total_training_steps: 90 + warmup_style: null + weight_decay: 0.01 + ppo_epochs: 1 + ppo_max_token_len_per_gpu: 32768 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: null + ppo_mini_batch_size: 256 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + shuffle: false + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false +custom_reward_function: + value: + name: compute_score + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +data: + value: + custom_cls: + name: null + path: null + datagen: + name: null + path: null + dataloader_num_workers: 8 + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + image_key: images + image_patch_size: 14 + max_prompt_length: 1024 + max_response_length: 2048 + prompt_key: prompt + return_full_prompt: false + return_multi_modal_inputs: true + return_raw_chat: true + return_raw_input_ids: false + reward_fn_key: data_source + sampler: + class_name: null + class_path: null + seed: null + shuffle: true + tokenizer: null + tool_config_path: null + train_batch_size: 512 + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + train_max_samples: -1 + truncation: error + 
trust_remote_code: false + use_shm: false + val_batch_size: null + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + val_max_samples: -1 + validation_shuffle: false + video_key: videos +global_profiler: + value: + _target_: verl.utils.profiler.ProfilerConfig + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + controller_nsight_options: + cuda-graph-trace: graph + cuda-memory-usage: "true" + trace: cuda,nvtx,cublas,ucx + discrete: false + worker_nsight_options: + capture-range: cudaProfilerApi + capture-range-end: null + cuda-graph-trace: graph + cuda-memory-usage: "true" + kill: none + trace: cuda,nvtx,cublas,ucx + torch_memory: + context: all + stack_depth: 32 + stacks: all + trace_alloc_max_entries: 100000 + profile_continuous_steps: false + save_path: outputs/profile + steps: null + tool: null +ray_kwargs: + value: + ray_init: + num_cpus: null + timeline_json_file: null +reward_manager: + value: + _target_: verl.trainer.config.config.RewardManagerConfig + module: + _target_: verl.trainer.config.config.ModuleConfig + name: custom_reward_manager + path: null + name: naive + source: register +reward_model: + value: + enable: false + enable_resource_pool: false + forward_max_token_len_per_gpu: 32768 + launch_reward_fn_async: false + max_length: null + micro_batch_size: null + micro_batch_size_per_gpu: null + model: + external_lib: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + forward_prefetch: false + fsdp_size: -1 + param_offload: false + reshard_after_forward: true + wrap_policy: + min_num_params: 0 + input_tokenizer: Qwen/Qwen3-4B-Instruct-2507 + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + trust_remote_code: false + use_fused_kernels: false + use_remove_padding: false + use_shm: false + n_gpus_per_node: 8 + nnodes: 0 + num_workers: 1 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + 
tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + reward_loop_class_name: null + reward_loop_module_path: null + reward_loop_source: register + reward_manager: naive + rollout: + _target_: verl.workers.config.RolloutConfig + cudagraph_capture_sizes: null + data_parallel_size: 1 + disable_log_stats: true + dtype: bfloat16 + enable_chunked_prefill: true + enable_prefix_caching: true + enforce_eager: true + expert_parallel_size: 1 + free_cache_engine: true + gpu_memory_utilization: 0.5 + limit_images: null + load_format: auto + max_model_len: null + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + name: ??? 
+ prompt_length: 2048 + response_length: 2048 + skip_tokenizer_init: false + tensor_model_parallel_size: 2 + sandbox_fusion: + max_concurrent: 64 + memory_limit_mb: 1024 + url: null + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false + use_reward_loop: true +trainer: + value: + balance_batch: true + critic_warmup: 0 + default_hdfs_dir: null + default_local_dir: /home/mshahidul/readctrl/code/RL_model/RL_model + del_local_ckpt_after_load: false + device: cuda + esi_redundant_time: 0 + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + log_val_generations: 0 + logger: + - console + - wandb + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + n_gpus_per_node: 2 + nnodes: 1 + project_name: readctrl-verl + ray_wait_register_center_timeout: 300 + remove_previous_ckpt_in_save: true + resume_from_path: null + resume_mode: auto + rollout_data_dir: null + save_freq: 20 + test_freq: 5 + total_epochs: 15 + total_training_steps: null + use_legacy_worker_impl: auto + val_before_train: true + val_only: false + validation_data_dir: null +transfer_queue: + value: + enable: false diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..fbe739ef51a0c17b32729bdaab877e21c32c8a92 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/files/output.log @@ -0,0 +1,113 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /home/mshahidul/readctrl/code/RL_model/RL_model/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} +validation generation end +("Initial validation metrics: {'val-aux/multiclinsum/reward/mean@1': " + "np.float64(-4.68564679482404), 'val-core/multiclinsum/acc/mean@1': " + "np.float64(-4.685646834438651), 'val-aux/num_turns/min': np.int32(2), " + "'val-aux/num_turns/max': np.int32(2), 'val-aux/num_turns/mean': " + 'np.float64(2.0)}') +step:0 - val-aux/multiclinsum/reward/mean@1:np.float64(-4.68564679482404) - val-core/multiclinsum/acc/mean@1:np.float64(-4.685646834438651) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) +Training Progress: 23%|██▎ | 21/90 [6:31:35<21:43:37, 1133.59s/it] +step:1 - global_seqlen/min:654763 - global_seqlen/max:655183 - global_seqlen/minmax_diff:420 - global_seqlen/balanced_min:654973 - global_seqlen/balanced_max:654973 - global_seqlen/mean:654973.0 - actor/entropy:0.5065052509307861 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.000741196616824407) - actor/kl_loss:np.float64(0.0010331022373672263) - actor/pg_clipfrac:np.float64(0.0003378839364813757) - actor/ppo_kl:np.float64(6.813416837303521e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.22174715995788574) - perf/mfu/actor:np.float64(0.2538084600620599) - perf/max_memory_allocated_gb:np.float64(69.58245754241943) - perf/max_memory_reserved_gb:np.float64(76.49609375) - perf/cpu_memory_used_gb:np.float64(761.7325325012207) - actor/lr:np.float64(1e-06) - training/global_step:1 - training/epoch:0 - critic/score/mean:-4.67110013961792 - critic/score/max:0.0 - 
critic/score/min:-6.6583356857299805 - critic/rewards/mean:-4.67110013961792 - critic/rewards/max:0.0 - critic/rewards/min:-6.6583356857299805 - critic/advantages/mean:0.0010903730290010571 - critic/advantages/max:1.1546990871429443 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:0.0010903730290010571 - critic/returns/max:1.1546990871429443 - critic/returns/min:-1.1546999216079712 - response_length/mean:343.8294372558594 - response_length/max:535.0 - response_length/min:164.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:343.8294372558594 - response_length_non_aborted/max:535.0 - response_length_non_aborted/min:164.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00041771307587623596 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(15.721401959657669) - timing_s/agent_loop/generate_sequences/max:np.float64(35.728235775604844) - timing_s/agent_loop/generate_sequences/mean:np.float64(30.650243707081852) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(35.728235775604844) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:535 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:532.7778770728037 - timing_s/reward:0.000401531346142292 - timing_s/old_log_prob:67.07459405530244 - timing_s/ref:106.42806424293667 - 
timing_s/adv:0.1088367709890008 - timing_s/update_actor:226.2403302807361 - timing_s/update_weights:48.76984470244497 - timing_s/step:982.3744567343965 - timing_s/stop_profile:0.0002471841871738434 - timing_per_token_ms/ref:0.08124614620979542 - timing_per_token_ms/update_actor:0.17270966152859438 - timing_per_token_ms/adv:8.308492944671063e-05 - timing_per_token_ms/gen:1.0088159119915543 - perf/total_num_tokens:1309946 - perf/time_per_step:982.3744567343965 - perf/throughput:666.7243793952639 +step:2 - global_seqlen/min:657547 - global_seqlen/max:661064 - global_seqlen/minmax_diff:3517 - global_seqlen/balanced_min:659305 - global_seqlen/balanced_max:659306 - global_seqlen/mean:659305.5 - actor/entropy:0.5050727128982544 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.00046617999032605266) - actor/kl_loss:np.float64(0.004400044631135339) - actor/pg_clipfrac:np.float64(0.00031516926840898424) - actor/ppo_kl:np.float64(8.927750469448863e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.17941228300333023) - perf/mfu/actor:np.float64(0.19345588961504842) - perf/max_memory_allocated_gb:np.float64(71.29543161392212) - perf/max_memory_reserved_gb:np.float64(76.966796875) - perf/cpu_memory_used_gb:np.float64(762.1224460601807) - actor/lr:np.float64(1e-06) - training/global_step:2 - training/epoch:0 - critic/score/mean:-4.693542003631592 - critic/score/max:-4.350610733032227 - critic/score/min:-6.654351711273193 - critic/rewards/mean:-4.693542003631592 - critic/rewards/max:-4.350610733032227 - critic/rewards/min:-6.654351711273193 - critic/advantages/mean:-6.344255234580487e-05 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:-6.344255234580487e-05 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546998023986816 - response_length/mean:349.470703125 - response_length/max:594.0 - response_length/min:196.0 - 
response_length/clip_ratio:0.0 - response_length_non_aborted/mean:349.470703125 - response_length_non_aborted/max:594.0 - response_length_non_aborted/min:196.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00011888612061738968 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(18.393846109509468) - timing_s/agent_loop/generate_sequences/max:np.float64(37.00908718723804) - timing_s/agent_loop/generate_sequences/mean:np.float64(31.609618558738173) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(37.00908718723804) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:594 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:543.5005487920716 - timing_s/reward:0.0003321198746562004 - timing_s/old_log_prob:67.04947808012366 - timing_s/ref:66.77672814112157 - timing_s/adv:0.12285151891410351 - timing_s/update_actor:298.7340990975499 - timing_s/update_weights:47.26549215242267 - timing_s/step:1024.3747734036297 - timing_s/stop_profile:0.0001833224669098854 - timing_per_token_ms/ref:0.05064171931003273 - timing_per_token_ms/update_actor:0.22655210604002993 - timing_per_token_ms/adv:9.316736999320004e-05 - timing_per_token_ms/gen:1.0125069139008054 - perf/total_num_tokens:1318611 - perf/time_per_step:1024.3747734036297 - perf/throughput:643.6174700098915 +step:3 - 
global_seqlen/min:655692 - global_seqlen/max:656374 - global_seqlen/minmax_diff:682 - global_seqlen/balanced_min:656033 - global_seqlen/balanced_max:656033 - global_seqlen/mean:656033.0 - actor/entropy:0.5025427341461182 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.00043453267547496075) - actor/kl_loss:np.float64(0.008234596306768555) - actor/pg_clipfrac:np.float64(0.00037463386161107337) - actor/ppo_kl:np.float64(9.679724318327014e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.18687571585178375) - perf/mfu/actor:np.float64(0.2063640764657233) - perf/max_memory_allocated_gb:np.float64(71.29543161392212) - perf/max_memory_reserved_gb:np.float64(77.03515625) - perf/cpu_memory_used_gb:np.float64(772.5685653686523) - actor/lr:np.float64(1e-06) - training/global_step:3 - training/epoch:0 - critic/score/mean:-4.6886444091796875 - critic/score/max:-4.166399955749512 - critic/score/min:-6.653900146484375 - critic/rewards/mean:-4.6886444091796875 - critic/rewards/max:-4.166399955749512 - critic/rewards/min:-6.653900146484375 - critic/advantages/mean:0.0004364756459835917 - critic/advantages/max:1.154699683189392 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0004364756459835917 - critic/returns/max:1.154699683189392 - critic/returns/min:-1.1546998023986816 - response_length/mean:345.2096252441406 - response_length/max:514.0 - response_length/min:166.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:345.2096252441406 - response_length_non_aborted/max:514.0 - response_length_non_aborted/min:166.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00019333790987730026 - timing_s/agent_loop/num_preempted/min:np.int64(-1) 
- timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(15.739830949343741) - timing_s/agent_loop/generate_sequences/max:np.float64(35.993458861485124) - timing_s/agent_loop/generate_sequences/mean:np.float64(31.046125460943585) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(35.993458861485124) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:514 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:545.5217836154625 - timing_s/reward:0.0002660593017935753 - timing_s/old_log_prob:67.08234614040703 - timing_s/ref:64.75110619980842 - timing_s/adv:0.12250437960028648 - timing_s/update_actor:278.562344907783 - timing_s/update_weights:44.43381206970662 - timing_s/step:1001.3569081621245 - timing_s/stop_profile:0.0003353990614414215 - timing_per_token_ms/ref:0.049350494715821015 - timing_per_token_ms/update_actor:0.21230818031088602 - timing_per_token_ms/adv:9.3367543706099e-05 - timing_per_token_ms/gen:1.0288166226278992 - perf/total_num_tokens:1312066 - perf/time_per_step:1001.3569081621245 - perf/throughput:655.1440297187075 +step:4 - global_seqlen/min:651280 - global_seqlen/max:652968 - global_seqlen/minmax_diff:1688 - global_seqlen/balanced_min:652124 - global_seqlen/balanced_max:652124 - global_seqlen/mean:652124.0 - actor/entropy:0.49776867032051086 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(4.385435810642241e-05) - actor/kl_loss:np.float64(0.011214460246264935) - actor/pg_clipfrac:np.float64(0.00024651542268353904) - actor/ppo_kl:np.float64(-8.540531356023469e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - 
actor/grad_norm:np.float64(0.18060217052698135) - perf/mfu/actor:np.float64(0.19904611575571846) - perf/max_memory_allocated_gb:np.float64(71.29543161392212) - perf/max_memory_reserved_gb:np.float64(77.03515625) - perf/cpu_memory_used_gb:np.float64(773.5313568115234) - actor/lr:np.float64(1e-06) - training/global_step:4 - training/epoch:0 - critic/score/mean:-4.687844753265381 - critic/score/max:-3.6663999557495117 - critic/score/min:-6.6585259437561035 - critic/rewards/mean:-4.687844753265381 - critic/rewards/max:-3.6663999557495117 - critic/rewards/min:-6.6585259437561035 - critic/advantages/mean:0.0008609071956016123 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0008609071956016123 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546998023986816 - response_length/mean:340.1197814941406 - response_length/max:550.0 - response_length/min:167.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:340.1197814941406 - response_length_non_aborted/max:550.0 - response_length_non_aborted/min:167.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:5.3911469876766205e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(15.360883947461843) - timing_s/agent_loop/generate_sequences/max:np.float64(34.06471131555736) - timing_s/agent_loop/generate_sequences/mean:np.float64(29.25812927080915) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - 
timing_s/agent_loop/slowest/generate_sequences:np.float64(34.06471131555736) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:550 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:569.0925595201552 - timing_s/reward:0.0002264268696308136 - timing_s/old_log_prob:61.673032318241894 - timing_s/ref:63.75967378076166 - timing_s/adv:0.10188838467001915 - timing_s/update_actor:286.84419742040336 - timing_s/update_weights:39.320767474360764 - timing_s/step:1021.6001568958163 - timing_s/stop_profile:0.0002340683713555336 - timing_per_token_ms/ref:0.048886157985875124 - timing_per_token_ms/update_actor:0.21993071671982886 - timing_per_token_ms/adv:7.812040706216851e-05 - timing_per_token_ms/gen:1.08933081083594 - perf/total_num_tokens:1304248 - perf/time_per_step:1021.6001568958163 - perf/throughput:638.3358455831798 +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 5} +validation generation end +step:5 - global_seqlen/min:653686 - global_seqlen/max:655806 - global_seqlen/minmax_diff:2120 - global_seqlen/balanced_min:654746 - global_seqlen/balanced_max:654746 - global_seqlen/mean:654746.0 - actor/entropy:0.5018743276596069 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(3.8751025082701604e-05) - actor/kl_loss:np.float64(0.011246858125862975) - actor/pg_clipfrac:np.float64(0.0003292237252026098) - actor/ppo_kl:np.float64(-2.591675489801067e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.1702505424618721) - perf/mfu/actor:np.float64(0.19242260390075488) - perf/max_memory_allocated_gb:np.float64(71.29543161392212) - perf/max_memory_reserved_gb:np.float64(77.03515625) - perf/cpu_memory_used_gb:np.float64(778.2317504882812) - actor/lr:np.float64(1e-06) - 
val-aux/multiclinsum/reward/mean@1:np.float64(-4.695310884363511) - val-core/multiclinsum/acc/mean@1:np.float64(-4.695310920405636) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) - training/global_step:5 - training/epoch:0 - critic/score/mean:-4.687351226806641 - critic/score/max:-4.166399955749512 - critic/score/min:-6.655036449432373 - critic/rewards/mean:-4.687351226806641 - critic/rewards/max:-4.166399955749512 - critic/rewards/min:-6.655036449432373 - critic/advantages/mean:-0.0014173081144690514 - critic/advantages/max:1.1546990871429443 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:-0.0014173081144690514 - critic/returns/max:1.1546990871429443 - critic/returns/min:-1.1546998023986816 - response_length/mean:343.5338439941406 - response_length/max:524.0 - response_length/min:180.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:343.5338439941406 - response_length_non_aborted/max:524.0 - response_length_non_aborted/min:180.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00011871568858623505 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(17.51734335627407) - timing_s/agent_loop/generate_sequences/max:np.float64(36.52718726173043) - timing_s/agent_loop/generate_sequences/mean:np.float64(31.569438475302984) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(36.52718726173043) 
- timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:522 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:554.6921116299927 - timing_s/reward:0.00030663609504699707 - timing_s/old_log_prob:61.45196881517768 - timing_s/ref:62.70214543119073 - timing_s/adv:0.11446317937225103 - timing_s/update_actor:298.15522663947195 - timing_s/update_weights:48.96870417520404 - timing_s/step:1026.92679348588 - timing_s/testing:146.7314405143261 - timing_s/stop_profile:0.00017601344734430313 - timing_per_token_ms/ref:0.047882801446049866 - timing_per_token_ms/update_actor:0.22768770381145662 - timing_per_token_ms/adv:8.741036934341793e-05 - timing_per_token_ms/gen:1.0512142324908706 - perf/total_num_tokens:1309492 - perf/time_per_step:1026.92679348588 - perf/throughput:637.5780670572236 +step:6 - global_seqlen/min:654761 - global_seqlen/max:655123 - global_seqlen/minmax_diff:362 - global_seqlen/balanced_min:654942 - global_seqlen/balanced_max:654942 - global_seqlen/mean:654942.0 - actor/entropy:0.5043251514434814 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0007839634642247493) - actor/kl_loss:np.float64(0.011611796061818799) - actor/pg_clipfrac:np.float64(0.0005896374459553044) - actor/ppo_kl:np.float64(5.185746770545544e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.22193153202533722) - perf/mfu/actor:np.float64(0.1787911802671971) - perf/max_memory_allocated_gb:np.float64(71.29543161392212) - perf/max_memory_reserved_gb:np.float64(77.03515625) - perf/cpu_memory_used_gb:np.float64(775.9472579956055) - actor/lr:np.float64(1e-06) - training/global_step:6 - training/epoch:0 - critic/score/mean:-4.68715763092041 - critic/score/max:0.0 - critic/score/min:-6.656299114227295 - critic/rewards/mean:-4.68715763092041 - critic/rewards/max:0.0 - critic/rewards/min:-6.656299114227295 - 
critic/advantages/mean:-0.00022106485266704112 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:-0.00022106485266704112 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546999216079712 - response_length/mean:343.7890625 - response_length/max:572.0 - response_length/min:176.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:343.7890625 - response_length_non_aborted/max:572.0 - response_length_non_aborted/min:176.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00012861751019954681 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(15.779205864295363) - timing_s/agent_loop/generate_sequences/max:np.float64(34.84829577431083) - timing_s/agent_loop/generate_sequences/mean:np.float64(29.704547839901352) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(34.84829577431083) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:572 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:552.0134576605633 - timing_s/reward:0.00044271908700466156 - timing_s/old_log_prob:65.30917584709823 - timing_s/ref:64.39921687357128 - timing_s/adv:0.12116355635225773 - timing_s/update_actor:320.6527865920216 - timing_s/update_weights:48.58900824841112 - timing_s/step:1051.9935990590602 - 
timing_s/stop_profile:0.000228002667427063 - timing_per_token_ms/ref:0.04916406099591359 - timing_per_token_ms/update_actor:0.24479479602164894 - timing_per_token_ms/adv:9.249945518248771e-05 - timing_per_token_ms/gen:1.045361242397764 - perf/total_num_tokens:1309884 - perf/time_per_step:1051.9935990590602 - perf/throughput:622.5722291331458 +step:7 - global_seqlen/min:654596 - global_seqlen/max:657468 - global_seqlen/minmax_diff:2872 - global_seqlen/balanced_min:656031 - global_seqlen/balanced_max:656033 - global_seqlen/mean:656032.0 - actor/entropy:0.5089455842971802 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.00027776949355029284) - actor/kl_loss:np.float64(0.014928277174476534) - actor/pg_clipfrac:np.float64(0.000452209441694625) - actor/ppo_kl:np.float64(3.9500911573971585e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.19540779292583466) - perf/mfu/actor:np.float64(0.2052965793716499) - perf/max_memory_allocated_gb:np.float64(71.29543161392212) - perf/max_memory_reserved_gb:np.float64(77.03515625) - perf/cpu_memory_used_gb:np.float64(778.6901321411133) - actor/lr:np.float64(1e-06) - training/global_step:7 - training/epoch:1 - critic/score/mean:-4.7015252113342285 - critic/score/max:-4.139372825622559 - critic/score/min:-6.653066635131836 - critic/rewards/mean:-4.7015252113342285 - critic/rewards/max:-4.139372825622559 - critic/rewards/min:-6.653066635131836 - critic/advantages/mean:0.0007143918774090707 - critic/advantages/max:1.1546999216079712 - critic/advantages/min:-1.154699683189392 - critic/returns/mean:0.0007143918774090707 - critic/returns/max:1.1546999216079712 - critic/returns/min:-1.154699683189392 - response_length/mean:345.2083435058594 - response_length/max:592.0 - response_length/min:137.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:345.2083435058594 - response_length_non_aborted/max:592.0 - response_length_non_aborted/min:137.0 - 
response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00040270760655403137 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(12.432528534904122) - timing_s/agent_loop/generate_sequences/max:np.float64(35.25175091903657) - timing_s/agent_loop/generate_sequences/mean:np.float64(29.83419538144699) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(35.25175091903657) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:592 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:696.327248102054 - timing_s/reward:0.0003340458497405052 - timing_s/old_log_prob:63.0634740954265 - timing_s/ref:66.93074960727245 - timing_s/adv:0.10218710917979479 - timing_s/update_actor:279.8528142403811 - timing_s/update_weights:48.70449667610228 - timing_s/step:1155.9910317827016 - timing_s/stop_profile:0.00018958747386932373 - timing_per_token_ms/ref:0.05101180247859285 - timing_per_token_ms/update_actor:0.21329204538832033 - timing_per_token_ms/adv:7.788271698621012e-05 - timing_per_token_ms/gen:1.3132303260826306 - perf/total_num_tokens:1312064 - perf/time_per_step:1155.9910317827016 - perf/throughput:567.5061328013124 +step:8 - global_seqlen/min:659339 - global_seqlen/max:662702 - global_seqlen/minmax_diff:3363 - global_seqlen/balanced_min:661020 - global_seqlen/balanced_max:661021 - 
global_seqlen/mean:661020.5 - actor/entropy:0.5028678178787231 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-1.4617294558169278e-05) - actor/kl_loss:np.float64(0.021420265819566946) - actor/pg_clipfrac:np.float64(0.00029503575418251177) - actor/ppo_kl:np.float64(-1.5121030060072371e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.1893029883503914) - perf/mfu/actor:np.float64(0.19974892151072873) - perf/max_memory_allocated_gb:np.float64(71.65932321548462) - perf/max_memory_reserved_gb:np.float64(77.03515625) - perf/cpu_memory_used_gb:np.float64(792.8120937347412) - actor/lr:np.float64(1e-06) - training/global_step:8 - training/epoch:1 - critic/score/mean:-4.694917678833008 - critic/score/max:-4.166399955749512 - critic/score/min:-6.6552886962890625 - critic/rewards/mean:-4.694917678833008 - critic/rewards/max:-4.166399955749512 - critic/rewards/min:-6.6552886962890625 - critic/advantages/mean:-0.000866191869135946 - critic/advantages/max:1.154699683189392 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:-0.000866191869135946 - critic/returns/max:1.154699683189392 - critic/returns/min:-1.1546998023986816 - response_length/mean:351.7037658691406 - response_length/max:533.0 - response_length/min:171.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:351.7037658691406 - response_length_non_aborted/max:533.0 - response_length_non_aborted/min:171.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00013454072177410126 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - 
timing_s/agent_loop/generate_sequences/min:np.float64(17.175686976872385) - timing_s/agent_loop/generate_sequences/max:np.float64(36.96275188494474) - timing_s/agent_loop/generate_sequences/mean:np.float64(32.126731734798646) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(36.96275188494474) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:530 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:668.7541367094964 - timing_s/reward:0.00039832573384046555 - timing_s/old_log_prob:65.6149131488055 - timing_s/ref:65.7256905771792 - timing_s/adv:0.10718224104493856 - timing_s/update_actor:289.83438884746283 - timing_s/update_weights:45.69146551936865 - timing_s/step:1136.5733840502799 - timing_s/stop_profile:0.00021047238260507584 - timing_per_token_ms/ref:0.04971531940172748 - timing_per_token_ms/update_actor:0.2192325267124566 - timing_per_token_ms/adv:8.107331092223204e-05 - timing_per_token_ms/gen:1.2379361195769412 - perf/total_num_tokens:1322041 - perf/time_per_step:1136.5733840502799 - perf/throughput:581.5906911741985 +step:9 - global_seqlen/min:659001 - global_seqlen/max:659717 - global_seqlen/minmax_diff:716 - global_seqlen/balanced_min:659359 - global_seqlen/balanced_max:659359 - global_seqlen/mean:659359.0 - actor/entropy:0.5027447938919067 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0008580627403963117) - actor/kl_loss:np.float64(0.026455452976127468) - actor/pg_clipfrac:np.float64(0.00027258153644046007) - actor/ppo_kl:np.float64(-5.577064447000642e-06) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.18188276886940002) - perf/mfu/actor:np.float64(0.19436610760867112) - 
perf/max_memory_allocated_gb:np.float64(71.65932321548462) - perf/max_memory_reserved_gb:np.float64(77.03515625) - perf/cpu_memory_used_gb:np.float64(782.687126159668) - actor/lr:np.float64(1e-06) - training/global_step:9 - training/epoch:1 - critic/score/mean:-4.687901496887207 - critic/score/max:0.0 - critic/score/min:-6.6585259437561035 - critic/rewards/mean:-4.687901496887207 - critic/rewards/max:0.0 - critic/rewards/min:-6.6585259437561035 - critic/advantages/mean:-0.0027746346313506365 - critic/advantages/max:1.154699683189392 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:-0.0027746346313506365 - critic/returns/max:1.154699683189392 - critic/returns/min:-1.1546998023986816 - response_length/mean:349.5403747558594 - response_length/max:553.0 - response_length/min:169.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:349.5403747558594 - response_length_non_aborted/max:553.0 - response_length_non_aborted/min:169.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:9.132176637649536e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(16.295462796464562) - timing_s/agent_loop/generate_sequences/max:np.float64(37.19654051028192) - timing_s/agent_loop/generate_sequences/mean:np.float64(31.63057109564761) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(37.19654051028192) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - 
timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:553 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:672.9020651578903 - timing_s/reward:0.0001805722713470459 - timing_s/old_log_prob:64.229261983186 - timing_s/ref:68.27041580341756 - timing_s/adv:0.12314794585108757 - timing_s/update_actor:297.0961298234761 - timing_s/update_weights:45.48334850091487 - timing_s/step:1149.0070040849969 - timing_s/stop_profile:0.00020004063844680786 - timing_per_token_ms/ref:0.051770291907305095 - timing_per_token_ms/update_actor:0.2252916315872507 - timing_per_token_ms/adv:9.338459462226767e-05 - timing_per_token_ms/gen:1.2533238686926849 - perf/total_num_tokens:1318718 - perf/time_per_step:1149.0070040849969 - perf/throughput:573.8511581355203 +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 10} +validation generation end +step:10 - global_seqlen/min:650490 - global_seqlen/max:651709 - global_seqlen/minmax_diff:1219 - global_seqlen/balanced_min:651099 - global_seqlen/balanced_max:651100 - global_seqlen/mean:651099.5 - actor/entropy:0.5069866180419922 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.0008170801866071993) - actor/kl_loss:np.float64(0.030172128269138433) - actor/pg_clipfrac:np.float64(0.0003385392313551468) - actor/ppo_kl:np.float64(6.086469511501491e-06) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.2098386287689209) - perf/mfu/actor:np.float64(0.187684566610385) - perf/max_memory_allocated_gb:np.float64(71.65932321548462) - perf/max_memory_reserved_gb:np.float64(77.03515625) - perf/cpu_memory_used_gb:np.float64(778.098503112793) - actor/lr:np.float64(1e-06) - val-aux/multiclinsum/reward/mean@1:np.float64(-4.706603291455437) - val-core/multiclinsum/acc/mean@1:np.float64(-4.706603331683899) - val-aux/num_turns/min:np.int32(2) - 
val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) - training/global_step:10 - training/epoch:1 - critic/score/mean:-4.667698383331299 - critic/score/max:0.0 - critic/score/min:-6.654905796051025 - critic/rewards/mean:-4.667698383331299 - critic/rewards/max:0.0 - critic/rewards/min:-6.654905796051025 - critic/advantages/mean:0.002126588486135006 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:0.002126588486135006 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546999216079712 - response_length/mean:338.7857971191406 - response_length/max:505.0 - response_length/min:152.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:338.7857971191406 - response_length_non_aborted/max:505.0 - response_length_non_aborted/min:152.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:7.934868335723877e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(14.223829651251435) - timing_s/agent_loop/generate_sequences/max:np.float64(35.41638055164367) - timing_s/agent_loop/generate_sequences/mean:np.float64(30.505125688131255) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(35.41638055164367) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:489 - 
timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:529.0299312742427 - timing_s/reward:0.0001901639625430107 - timing_s/old_log_prob:66.76588043943048 - timing_s/ref:75.8792326003313 - timing_s/adv:0.10744902398437262 - timing_s/update_actor:303.76764200627804 - timing_s/update_weights:50.84769791178405 - timing_s/step:1027.2355805942789 - timing_s/testing:341.63818121515214 - timing_s/stop_profile:0.00017961766570806503 - timing_per_token_ms/ref:0.05827007438980625 - timing_per_token_ms/update_actor:0.2332728269690562 - timing_per_token_ms/adv:8.25135205789381e-05 - timing_per_token_ms/gen:1.0166321042983286 - perf/total_num_tokens:1302199 - perf/time_per_step:1027.2355805942789 - perf/throughput:633.8365924039783 +step:11 - global_seqlen/min:654659 - global_seqlen/max:655879 - global_seqlen/minmax_diff:1220 - global_seqlen/balanced_min:655269 - global_seqlen/balanced_max:655269 - global_seqlen/mean:655269.0 - actor/entropy:0.5055292844772339 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.00013089372411438215) - actor/kl_loss:np.float64(0.037213679713507496) - actor/pg_clipfrac:np.float64(0.00032693695842075005) - actor/ppo_kl:np.float64(-4.3153339523390365e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.19852808862924576) - perf/mfu/actor:np.float64(0.13317388029200594) - perf/max_memory_allocated_gb:np.float64(71.65932321548462) - perf/max_memory_reserved_gb:np.float64(77.03515625) - perf/cpu_memory_used_gb:np.float64(776.315746307373) - actor/lr:np.float64(1e-06) - training/global_step:11 - training/epoch:1 - critic/score/mean:-4.683182239532471 - critic/score/max:-4.333066463470459 - critic/score/min:-6.656299114227295 - critic/rewards/mean:-4.683182239532471 - critic/rewards/max:-4.333066463470459 - critic/rewards/min:-6.656299114227295 - critic/advantages/mean:-0.00024311651941388845 - critic/advantages/max:1.1546906232833862 - critic/advantages/min:-1.1546998023986816 
- critic/returns/mean:-0.00024311651941388845 - critic/returns/max:1.1546906232833862 - critic/returns/min:-1.1546998023986816 - response_length/mean:344.21484375 - response_length/max:512.0 - response_length/min:171.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:344.21484375 - response_length_non_aborted/max:512.0 - response_length_non_aborted/min:171.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:9.311176836490631e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(15.763386534526944) - timing_s/agent_loop/generate_sequences/max:np.float64(34.962317341938615) - timing_s/agent_loop/generate_sequences/mean:np.float64(30.0605063730812) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(34.962317341938615) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:504 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:534.7032260540873 - timing_s/reward:0.00032849889248609543 - timing_s/old_log_prob:67.4622592665255 - timing_s/ref:69.89597464632243 - timing_s/adv:0.12777969427406788 - timing_s/update_actor:430.46078172698617 - timing_s/update_weights:59.47819851152599 - timing_s/step:1162.9855499956757 - timing_s/stop_profile:0.0002899486571550369 - timing_per_token_ms/ref:0.05333380233638584 - timing_per_token_ms/update_actor:0.32846112186520815 - 
timing_per_token_ms/adv:9.750170866778977e-05 - timing_per_token_ms/gen:1.0113279127355947 - perf/total_num_tokens:1310538 - perf/time_per_step:1162.9855499956757 - perf/throughput:563.4369231865662 +step:12 - global_seqlen/min:659860 - global_seqlen/max:660805 - global_seqlen/minmax_diff:945 - global_seqlen/balanced_min:660332 - global_seqlen/balanced_max:660333 - global_seqlen/mean:660332.5 - actor/entropy:0.501306414604187 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(6.467529844182002e-05) - actor/kl_loss:np.float64(0.04175983710835378) - actor/pg_clipfrac:np.float64(0.00040375756968084414) - actor/ppo_kl:np.float64(0.00022953477332521288) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.19989632815122604) - perf/mfu/actor:np.float64(0.17476189281203094) - perf/max_memory_allocated_gb:np.float64(71.65932321548462) - perf/max_memory_reserved_gb:np.float64(77.03515625) - perf/cpu_memory_used_gb:np.float64(776.4099159240723) - actor/lr:np.float64(1e-06) - training/global_step:12 - training/epoch:1 - critic/score/mean:-4.684951305389404 - critic/score/max:-4.257299900054932 - critic/score/min:-6.649450778961182 - critic/rewards/mean:-4.684951305389404 - critic/rewards/max:-4.257299900054932 - critic/rewards/min:-6.649450778961182 - critic/advantages/mean:0.0016807981301099062 - critic/advantages/max:1.1546969413757324 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0016807981301099062 - critic/returns/max:1.1546969413757324 - critic/returns/min:-1.1546998023986816 - response_length/mean:350.8079528808594 - response_length/max:542.0 - response_length/min:161.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:350.8079528808594 - response_length_non_aborted/max:542.0 - response_length_non_aborted/min:161.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - 
prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:9.740330278873444e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(15.377726320177317) - timing_s/agent_loop/generate_sequences/max:np.float64(36.23258405178785) - timing_s/agent_loop/generate_sequences/mean:np.float64(31.11069919390987) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(36.23258405178785) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:542 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:513.8298309054226 - timing_s/reward:0.00028319843113422394 - timing_s/old_log_prob:67.87295804359019 - timing_s/ref:70.22551577910781 - timing_s/adv:0.13798057287931442 - timing_s/update_actor:330.85904132016003 - timing_s/update_weights:60.60802887752652 - timing_s/step:1044.4931241758168 - timing_s/stop_profile:0.00021800026297569275 - timing_per_token_ms/ref:0.053174359719616864 - timing_per_token_ms/update_actor:0.250524577633359 - timing_per_token_ms/adv:0.00010447810222828229 - timing_per_token_ms/gen:0.953583396410857 - perf/total_num_tokens:1320665 - perf/time_per_step:1044.4931241758168 - perf/throughput:632.2037787669036 +step:13 - global_seqlen/min:664384 - global_seqlen/max:665375 - global_seqlen/minmax_diff:991 - global_seqlen/balanced_min:664879 - global_seqlen/balanced_max:664880 - global_seqlen/mean:664879.5 - actor/entropy:0.5033592581748962 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.00047870623469255724) - 
actor/kl_loss:np.float64(0.04223046444046) - actor/pg_clipfrac:np.float64(0.0004591864805358152) - actor/ppo_kl:np.float64(1.3223234759607294e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.21367954462766647) - perf/mfu/actor:np.float64(0.19793449345158742) - perf/max_memory_allocated_gb:np.float64(71.833993434906) - perf/max_memory_reserved_gb:np.float64(77.03515625) - perf/cpu_memory_used_gb:np.float64(776.4014739990234) - actor/lr:np.float64(1e-06) - training/global_step:13 - training/epoch:2 - critic/score/mean:-4.691803455352783 - critic/score/max:-4.166399955749512 - critic/score/min:-6.653242111206055 - critic/rewards/mean:-4.691803455352783 - critic/rewards/max:-4.166399955749512 - critic/rewards/min:-6.653242111206055 - critic/advantages/mean:0.00033822489785961807 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:0.00033822489785961807 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546999216079712 - response_length/mean:356.728515625 - response_length/max:556.0 - response_length/min:169.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:356.728515625 - response_length_non_aborted/max:556.0 - response_length_non_aborted/min:169.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.0004950203001499176 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(17.481259025633335) - timing_s/agent_loop/generate_sequences/max:np.float64(37.703203953802586) - 
timing_s/agent_loop/generate_sequences/mean:np.float64(32.515622308833066) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(37.703203953802586) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:556 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:716.6196444109082 - timing_s/reward:0.0003485623747110367 - timing_s/old_log_prob:68.97060603648424 - timing_s/ref:70.52237541228533 - timing_s/adv:0.1193194929510355 - timing_s/update_actor:294.2539288774133 - timing_s/update_weights:60.702906703576446 - timing_s/step:1212.1629219111055 - timing_s/stop_profile:0.0002684053033590317 - timing_per_token_ms/ref:0.05303395232691437 - timing_per_token_ms/update_actor:0.22128365281033124 - timing_per_token_ms/adv:8.973016385001756e-05 - timing_per_token_ms/gen:1.307855209853191 - perf/total_num_tokens:1329759 - perf/time_per_step:1212.1629219111055 - perf/throughput:548.5067130677004 +step:14 - global_seqlen/min:661831 - global_seqlen/max:664860 - global_seqlen/minmax_diff:3029 - global_seqlen/balanced_min:663345 - global_seqlen/balanced_max:663346 - global_seqlen/mean:663345.5 - actor/entropy:0.5079573392868042 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.0010451093313103427) - actor/kl_loss:np.float64(0.043824755353853106) - actor/pg_clipfrac:np.float64(0.00040754955534794135) - actor/ppo_kl:np.float64(2.7192728339286987e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.21206165850162506) - perf/mfu/actor:np.float64(0.1790141528174528) - perf/max_memory_allocated_gb:np.float64(71.833993434906) - perf/max_memory_reserved_gb:np.float64(77.03515625) - perf/cpu_memory_used_gb:np.float64(776.4714012145996) - 
actor/lr:np.float64(1e-06) - training/global_step:14 - training/epoch:2 - critic/score/mean:-4.689581394195557 - critic/score/max:-4.30276346206665 - critic/score/min:-6.6547722816467285 - critic/rewards/mean:-4.689581394195557 - critic/rewards/max:-4.30276346206665 - critic/rewards/min:-6.6547722816467285 - critic/advantages/mean:0.0026998945977538824 - critic/advantages/max:1.1546995639801025 - critic/advantages/min:-1.154699683189392 - critic/returns/mean:0.0026998945977538824 - critic/returns/max:1.1546995639801025 - critic/returns/min:-1.154699683189392 - response_length/mean:354.7311096191406 - response_length/max:545.0 - response_length/min:185.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:354.7311096191406 - response_length_non_aborted/max:545.0 - response_length_non_aborted/min:185.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00016426853835582733 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(16.168270237743855) - timing_s/agent_loop/generate_sequences/max:np.float64(36.204962929710746) - timing_s/agent_loop/generate_sequences/mean:np.float64(30.287923354518473) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(36.204962929710746) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:545 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - 
timing_s/gen:600.4213687945157 - timing_s/reward:0.0003162045031785965 - timing_s/old_log_prob:68.50302020087838 - timing_s/ref:70.06671695038676 - timing_s/adv:0.10495083034038544 - timing_s/update_actor:324.4962479416281 - timing_s/update_weights:64.30404259823263 - timing_s/step:1128.818719709292 - timing_s/stop_profile:0.00018018856644630432 - timing_per_token_ms/ref:0.05281313957084714 - timing_per_token_ms/update_actor:0.24459067555416303 - timing_per_token_ms/adv:7.910721512423424e-05 - timing_per_token_ms/gen:1.1019595035018008 - perf/total_num_tokens:1326691 - perf/time_per_step:1128.818719709292 - perf/throughput:587.6457294850968 +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 15} +validation generation end +step:15 - global_seqlen/min:656989 - global_seqlen/max:660919 - global_seqlen/minmax_diff:3930 - global_seqlen/balanced_min:658954 - global_seqlen/balanced_max:658954 - global_seqlen/mean:658954.0 - actor/entropy:0.5073931217193604 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0009976764615657902) - actor/kl_loss:np.float64(0.04644113375494877) - actor/pg_clipfrac:np.float64(0.0004750700103007451) - actor/ppo_kl:np.float64(-1.5362425263750385e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.21429309993982315) - perf/mfu/actor:np.float64(0.18851965639044999) - perf/max_memory_allocated_gb:np.float64(71.833993434906) - perf/max_memory_reserved_gb:np.float64(77.03515625) - perf/cpu_memory_used_gb:np.float64(810.3636436462402) - actor/lr:np.float64(1e-06) - val-aux/multiclinsum/reward/mean@1:np.float64(-4.716425775079166) - val-core/multiclinsum/acc/mean@1:np.float64(-4.716425809024233) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) - training/global_step:15 - training/epoch:2 - 
critic/score/mean:-4.68088436126709 - critic/score/max:0.0 - critic/score/min:-6.652886390686035 - critic/rewards/mean:-4.68088436126709 - critic/rewards/max:0.0 - critic/rewards/min:-6.652886390686035 - critic/advantages/mean:-0.0016181259416043758 - critic/advantages/max:1.1546995639801025 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:-0.0016181259416043758 - critic/returns/max:1.1546995639801025 - critic/returns/min:-1.1546998023986816 - response_length/mean:349.0130310058594 - response_length/max:573.0 - response_length/min:183.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:349.0130310058594 - response_length_non_aborted/max:573.0 - response_length_non_aborted/min:183.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00014469586312770844 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(17.807098189368844) - timing_s/agent_loop/generate_sequences/max:np.float64(36.84576861001551) - timing_s/agent_loop/generate_sequences/mean:np.float64(31.51808204243692) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(36.84576861001551) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:573 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:547.1458245106041 - timing_s/reward:0.00031647272408008575 - 
timing_s/old_log_prob:64.47275621071458 - timing_s/ref:67.5187945663929 - timing_s/adv:0.11124299094080925 - timing_s/update_actor:306.0460636783391 - timing_s/update_weights:48.784795604646206 - timing_s/step:1034.988062640652 - timing_s/testing:123.44360194914043 - timing_s/stop_profile:0.00019297190010547638 - timing_per_token_ms/ref:0.051231796579422006 - timing_per_token_ms/update_actor:0.23222111382459104 - timing_per_token_ms/adv:8.440876824543841e-05 - timing_per_token_ms/gen:1.0206344985312081 - perf/total_num_tokens:1317908 - perf/time_per_step:1034.988062640652 - perf/throughput:636.6778746401724 +step:16 - global_seqlen/min:653926 - global_seqlen/max:657430 - global_seqlen/minmax_diff:3504 - global_seqlen/balanced_min:655678 - global_seqlen/balanced_max:655678 - global_seqlen/mean:655678.0 - actor/entropy:0.5067970752716064 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.0007227528246100214) - actor/kl_loss:np.float64(0.052378803646812834) - actor/pg_clipfrac:np.float64(0.00048831862386578) - actor/ppo_kl:np.float64(0.00015560330166408676) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.21938753128051758) - perf/mfu/actor:np.float64(0.16389540511156236) - perf/max_memory_allocated_gb:np.float64(71.833993434906) - perf/max_memory_reserved_gb:np.float64(77.03515625) - perf/cpu_memory_used_gb:np.float64(819.0140113830566) - actor/lr:np.float64(1e-06) - training/global_step:16 - training/epoch:2 - critic/score/mean:-4.682726860046387 - critic/score/max:-3.7316999435424805 - critic/score/min:-6.654204845428467 - critic/rewards/mean:-4.682726860046387 - critic/rewards/max:-3.7316999435424805 - critic/rewards/min:-6.654204845428467 - critic/advantages/mean:0.0011771399294957519 - critic/advantages/max:1.1546980142593384 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0011771399294957519 - critic/returns/max:1.1546980142593384 - critic/returns/min:-1.1546998023986816 - 
response_length/mean:344.7474060058594 - response_length/max:545.0 - response_length/min:165.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:344.7474060058594 - response_length_non_aborted/max:545.0 - response_length_non_aborted/min:165.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:5.952268838882446e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(14.682694755494595) - timing_s/agent_loop/generate_sequences/max:np.float64(34.45234371162951) - timing_s/agent_loop/generate_sequences/mean:np.float64(29.64641257870365) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(34.45234371162951) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:519 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:523.5520031172782 - timing_s/reward:0.00042813830077648163 - timing_s/old_log_prob:61.26207252778113 - timing_s/ref:63.39221139624715 - timing_s/adv:0.10249279625713825 - timing_s/update_actor:350.0872738417238 - timing_s/update_weights:49.35922866500914 - timing_s/step:1048.6166824083775 - timing_s/stop_profile:0.00023597851395606995 - timing_per_token_ms/ref:0.048340962634286305 - timing_per_token_ms/update_actor:0.2669658535452797 - timing_per_token_ms/adv:7.815787342044285e-05 - timing_per_token_ms/gen:0.9887070150949862 - 
perf/total_num_tokens:1311356 - perf/time_per_step:1048.6166824083775 - perf/throughput:625.2790090026912 +step:17 - global_seqlen/min:650738 - global_seqlen/max:653700 - global_seqlen/minmax_diff:2962 - global_seqlen/balanced_min:652219 - global_seqlen/balanced_max:652219 - global_seqlen/mean:652219.0 - actor/entropy:0.5095911622047424 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.0005952033358577564) - actor/kl_loss:np.float64(0.05382113430338601) - actor/pg_clipfrac:np.float64(0.0005442177907752921) - actor/ppo_kl:np.float64(3.860618297342929e-06) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.24933938682079315) - perf/mfu/actor:np.float64(0.1916593072208419) - perf/max_memory_allocated_gb:np.float64(71.833993434906) - perf/max_memory_reserved_gb:np.float64(77.03515625) - perf/cpu_memory_used_gb:np.float64(823.3617477416992) - actor/lr:np.float64(1e-06) - training/global_step:17 - training/epoch:2 - critic/score/mean:-4.695258140563965 - critic/score/max:0.0 - critic/score/min:-6.6583356857299805 - critic/rewards/mean:-4.695258140563965 - critic/rewards/max:0.0 - critic/rewards/min:-6.6583356857299805 - critic/advantages/mean:-0.0011420475784689188 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:-0.0011420475784689188 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546998023986816 - response_length/mean:340.2434997558594 - response_length/max:585.0 - response_length/min:165.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:340.2434997558594 - response_length_non_aborted/max:585.0 - response_length_non_aborted/min:165.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - 
timing_s/start_profile:7.73891806602478e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(14.333692356944084) - timing_s/agent_loop/generate_sequences/max:np.float64(34.48998008295894) - timing_s/agent_loop/generate_sequences/mean:np.float64(28.918804073370364) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(34.48998008295894) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:585 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:715.0717759076506 - timing_s/reward:0.00028832443058490753 - timing_s/old_log_prob:65.24721425026655 - timing_s/ref:66.79824942164123 - timing_s/adv:0.11138137616217136 - timing_s/update_actor:297.9665198419243 - timing_s/update_weights:45.81990054436028 - timing_s/step:1191.6911746170372 - timing_s/stop_profile:0.00020736083388328552 - timing_per_token_ms/ref:0.051208451012344954 - timing_per_token_ms/update_actor:0.22842520674951536 - timing_per_token_ms/adv:8.538648533864496e-05 - timing_per_token_ms/gen:1.3682598933584837 - perf/total_num_tokens:1304438 - perf/time_per_step:1191.6911746170372 - perf/throughput:547.3053874126387 +step:18 - global_seqlen/min:645997 - global_seqlen/max:651417 - global_seqlen/minmax_diff:5420 - global_seqlen/balanced_min:648707 - global_seqlen/balanced_max:648707 - global_seqlen/mean:648707.0 - actor/entropy:0.5120618939399719 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.00024703459348529577) - actor/kl_loss:np.float64(0.056239318335428834) - actor/pg_clipfrac:np.float64(0.0005660099583716752) - 
actor/ppo_kl:np.float64(0.00010718490994558276) - actor/pg_clipfrac_lower:np.float64(4.2005493317750125e-06) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.2968503162264824) - perf/mfu/actor:np.float64(0.23542386524571066) - perf/max_memory_allocated_gb:np.float64(71.833993434906) - perf/max_memory_reserved_gb:np.float64(77.03515625) - perf/cpu_memory_used_gb:np.float64(818.4193458557129) - actor/lr:np.float64(1e-06) - training/global_step:18 - training/epoch:2 - critic/score/mean:-4.697525501251221 - critic/score/max:-4.257299900054932 - critic/score/min:-6.6547722816467285 - critic/rewards/mean:-4.697525501251221 - critic/rewards/max:-4.257299900054932 - critic/rewards/min:-6.6547722816467285 - critic/advantages/mean:0.0010697898687794805 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0010697898687794805 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546998023986816 - response_length/mean:335.6705627441406 - response_length/max:518.0 - response_length/min:181.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:335.6705627441406 - response_length_non_aborted/max:518.0 - response_length_non_aborted/min:181.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:6.849691271781921e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(16.32511654496193) - timing_s/agent_loop/generate_sequences/max:np.float64(33.651470052078366) - timing_s/agent_loop/generate_sequences/mean:np.float64(28.978435764376627) - 
timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(33.651470052078366) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:518 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:691.060277685523 - timing_s/reward:0.0003899112343788147 - timing_s/old_log_prob:62.73976634815335 - timing_s/ref:64.10997437499464 - timing_s/adv:0.10064641945064068 - timing_s/update_actor:241.33290633000433 - timing_s/update_weights:49.02811253257096 - timing_s/step:1109.2667649816722 - timing_s/stop_profile:0.0002564806491136551 - timing_per_token_ms/ref:0.04941366007688728 - timing_per_token_ms/update_actor:0.18601071541543743 - timing_per_token_ms/adv:7.757463650819297e-05 - timing_per_token_ms/gen:1.34032909421347 - perf/total_num_tokens:1297414 - perf/time_per_step:1109.2667649816722 - perf/throughput:584.8070279205726 +step:19 - global_seqlen/min:655688 - global_seqlen/max:656995 - global_seqlen/minmax_diff:1307 - global_seqlen/balanced_min:656341 - global_seqlen/balanced_max:656342 - global_seqlen/mean:656341.5 - actor/entropy:0.5066402554512024 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0001156257764402327) - actor/kl_loss:np.float64(0.057001512963324785) - actor/pg_clipfrac:np.float64(0.00046539987139719113) - actor/ppo_kl:np.float64(0.00012311842620723232) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.213738851249218) - perf/mfu/actor:np.float64(0.18301397558094565) - perf/max_memory_allocated_gb:np.float64(71.833993434906) - perf/max_memory_reserved_gb:np.float64(77.03515625) - perf/cpu_memory_used_gb:np.float64(819.3828506469727) - actor/lr:np.float64(1e-06) - training/global_step:19 - training/epoch:3 - 
critic/score/mean:-4.697227478027344 - critic/score/max:-4.166399955749512 - critic/score/min:-6.656299114227295 - critic/rewards/mean:-4.697227478027344 - critic/rewards/max:-4.166399955749512 - critic/rewards/min:-6.656299114227295 - critic/advantages/mean:5.53989848413039e-05 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:5.53989848413039e-05 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546999216079712 - response_length/mean:345.611328125 - response_length/max:571.0 - response_length/min:173.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:345.611328125 - response_length_non_aborted/max:571.0 - response_length_non_aborted/min:173.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.0004153996706008911 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(16.570175739005208) - timing_s/agent_loop/generate_sequences/max:np.float64(35.85019491426647) - timing_s/agent_loop/generate_sequences/mean:np.float64(30.900087094858463) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(35.85019491426647) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:571 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:517.053280102089 - timing_s/reward:0.00021444261074066162 - 
timing_s/old_log_prob:61.01270724274218 - timing_s/ref:67.03032151982188 - timing_s/adv:0.12359076738357544 - timing_s/update_actor:314.1222662795335 - timing_s/update_weights:43.691157491877675 - timing_s/step:1003.9673904255033 - timing_s/stop_profile:0.00012782402336597443 - timing_per_token_ms/ref:0.05106360143295973 - timing_per_token_ms/update_actor:0.239297885536366 - timing_per_token_ms/adv:9.41512668203789e-05 - timing_per_token_ms/gen:0.973993621850791 - perf/total_num_tokens:1312683 - perf/time_per_step:1003.9673904255033 - perf/throughput:653.747827129951 +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 20} +validation generation end +local_global_step_folder: /home/mshahidul/readctrl/code/RL_model/RL_model/global_step_20 +Warning: remove_previous_ckpt_in_save is deprecated, set max_actor_ckpt_to_keep=1 and max_critic_ckpt_to_keep=1 instead +step:20 - global_seqlen/min:659540 - global_seqlen/max:659663 - global_seqlen/minmax_diff:123 - global_seqlen/balanced_min:659601 - global_seqlen/balanced_max:659602 - global_seqlen/mean:659601.5 - actor/entropy:0.5037832260131836 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.0008146971328339275) - actor/kl_loss:np.float64(0.06021132219272356) - actor/pg_clipfrac:np.float64(0.0004640529647682949) - actor/ppo_kl:np.float64(8.042591124043005e-06) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.24844252318143845) - perf/mfu/actor:np.float64(0.20304556513910235) - perf/max_memory_allocated_gb:np.float64(71.833993434906) - perf/max_memory_reserved_gb:np.float64(77.03515625) - perf/cpu_memory_used_gb:np.float64(821.3580532073975) - actor/lr:np.float64(1e-06) - val-aux/multiclinsum/reward/mean@1:np.float64(-4.72257199567907) - val-core/multiclinsum/acc/mean@1:np.float64(-4.722572024942353) - val-aux/num_turns/min:np.int32(2) - 
val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) - training/global_step:20 - training/epoch:3 - critic/score/mean:-4.691334247589111 - critic/score/max:0.0 - critic/score/min:-6.659307956695557 - critic/rewards/mean:-4.691334247589111 - critic/rewards/max:0.0 - critic/rewards/min:-6.659307956695557 - critic/advantages/mean:0.002101339166983962 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.002101339166983962 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546998023986816 - response_length/mean:349.8561096191406 - response_length/max:598.0 - response_length/min:161.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:349.8561096191406 - response_length_non_aborted/max:598.0 - response_length_non_aborted/min:161.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:7.566064596176147e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(14.487292733043432) - timing_s/agent_loop/generate_sequences/max:np.float64(35.15694830007851) - timing_s/agent_loop/generate_sequences/mean:np.float64(30.005215190287952) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(35.15694830007851) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:598 - 
timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:528.3028058838099 - timing_s/reward:0.0006378665566444397 - timing_s/old_log_prob:56.14322682842612 - timing_s/ref:65.809225467965 - timing_s/adv:0.09716053120791912 - timing_s/update_actor:284.4798775110394 - timing_s/update_weights:46.110762003809214 - timing_s/step:981.8361614625901 - timing_s/testing:118.77160589396954 - timing_s/save_checkpoint:140.1031941920519 - timing_s/stop_profile:0.0003814995288848877 - timing_per_token_ms/ref:0.04988559415644522 - timing_per_token_ms/update_actor:0.21564526271623047 - timing_per_token_ms/adv:7.365093257665357e-05 - timing_per_token_ms/gen:0.9831102553017701 - perf/total_num_tokens:1319203 - perf/time_per_step:981.8361614625901 - perf/throughput:671.8040401133994 +step:21 - global_seqlen/min:664883 - global_seqlen/max:667988 - global_seqlen/minmax_diff:3105 - global_seqlen/balanced_min:666435 - global_seqlen/balanced_max:666436 - global_seqlen/mean:666435.5 - actor/entropy:0.5030105710029602 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.00022624363070160303) - actor/kl_loss:np.float64(0.06610695226117969) - actor/pg_clipfrac:np.float64(0.0004282219254794957) - actor/ppo_kl:np.float64(9.36621105059506e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.2528355270624161) - perf/mfu/actor:np.float64(0.14738025345446115) - perf/max_memory_allocated_gb:np.float64(71.93059206008911) - perf/max_memory_reserved_gb:np.float64(77.046875) - perf/cpu_memory_used_gb:np.float64(822.3719940185547) - actor/lr:np.float64(1e-06) - training/global_step:21 - training/epoch:3 - critic/score/mean:-4.698609828948975 - critic/score/max:-3.7316999435424805 - critic/score/min:-6.655873775482178 - critic/rewards/mean:-4.698609828948975 - critic/rewards/max:-3.7316999435424805 - critic/rewards/min:-6.655873775482178 - critic/advantages/mean:0.0004760405281558633 - critic/advantages/max:1.154699683189392 - 
critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0004760405281558633 - critic/returns/max:1.154699683189392 - critic/returns/min:-1.1546998023986816 - response_length/mean:358.7545471191406 - response_length/max:575.0 - response_length/min:201.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:358.7545471191406 - response_length_non_aborted/max:575.0 - response_length_non_aborted/min:201.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00010525435209274292 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(18.205780938267708) - timing_s/agent_loop/generate_sequences/max:np.float64(36.16833139024675) - timing_s/agent_loop/generate_sequences/mean:np.float64(31.082134763347614) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(36.16833139024675) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:575 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:541.671341214329 - timing_s/reward:0.0002337731420993805 - timing_s/old_log_prob:60.27375853806734 - timing_s/ref:71.81199766509235 - timing_s/adv:0.13714388571679592 - timing_s/update_actor:395.75280053541064 - timing_s/update_weights:50.803287921473384 - timing_s/step:1121.2797775249928 - timing_s/stop_profile:0.00021426193416118622 - timing_per_token_ms/ref:0.05387768033447524 - 
timing_per_token_ms/update_actor:0.29691755656429664 - timing_per_token_ms/adv:0.00010289359264084515 - timing_per_token_ms/gen:0.9829857366328626 - perf/total_num_tokens:1332871 - perf/time_per_step:1121.2797775249928 - perf/throughput:594.3525544276084 +step:22 - global_seqlen/min:670364 - global_seqlen/max:671151 - global_seqlen/minmax_diff:787 - global_seqlen/balanced_min:670757 - global_seqlen/balanced_max:670758 - global_seqlen/mean:670757.5 - actor/entropy:0.502191424369812 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0001432400604244274) - actor/kl_loss:np.float64(0.0755203029451271) - actor/pg_clipfrac:np.float64(0.0005409475894945596) - actor/ppo_kl:np.float64(9.85649163188403e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.23153740912675858) - perf/mfu/actor:np.float64(0.11794788803483659) - perf/max_memory_allocated_gb:np.float64(72.18333101272583) - perf/max_memory_reserved_gb:np.float64(77.046875) - perf/cpu_memory_used_gb:np.float64(819.2932968139648) - actor/lr:np.float64(1e-06) - training/global_step:22 - training/epoch:3 - critic/score/mean:-4.69111967086792 - critic/score/max:-4.260602951049805 - critic/score/min:-6.655036449432373 - critic/rewards/mean:-4.69111967086792 - critic/rewards/max:-4.260602951049805 - critic/rewards/min:-6.655036449432373 - critic/advantages/mean:-0.00011532370263012126 - critic/advantages/max:1.154699683189392 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:-0.00011532370263012126 - critic/returns/max:1.154699683189392 - critic/returns/min:-1.1546998023986816 - response_length/mean:364.3821716308594 - response_length/max:548.0 - response_length/min:208.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:364.3821716308594 - response_length_non_aborted/max:548.0 - response_length_non_aborted/min:208.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - 
prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00012806616723537445 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(18.279244696721435) - timing_s/agent_loop/generate_sequences/max:np.float64(35.76543524675071) - timing_s/agent_loop/generate_sequences/mean:np.float64(31.123977443913947) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(35.76543524675071) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:515 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:541.0459138844162 - timing_s/reward:0.00026310794055461884 - timing_s/old_log_prob:60.441118674352765 - timing_s/ref:73.22158947587013 - timing_s/adv:0.11318462528288364 - timing_s/update_actor:497.75327671691775 - timing_s/update_weights:51.29783349297941 - timing_s/step:1224.753410840407 - timing_s/stop_profile:0.00015092827379703522 - timing_per_token_ms/ref:0.054581267802350425 - timing_per_token_ms/update_actor:0.3710381745391723 - timing_per_token_ms/adv:8.43707489538944e-05 - timing_per_token_ms/gen:0.9666868216291065 - perf/total_num_tokens:1341515 - perf/time_per_step:1224.753410840407 - perf/throughput:547.6673868086935 +step:23 - global_seqlen/min:670571 - global_seqlen/max:672261 - global_seqlen/minmax_diff:1690 - global_seqlen/balanced_min:671416 - global_seqlen/balanced_max:671416 - global_seqlen/mean:671416.0 - actor/entropy:0.5005431175231934 - perf/mfu/actor_infer:0 - 
actor/pg_loss:np.float64(0.0004768258841068031) - actor/kl_loss:np.float64(0.0825445243778328) - actor/pg_clipfrac:np.float64(0.0007546616870968137) - actor/ppo_kl:np.float64(5.753393899491736e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.2568960040807724) - perf/mfu/actor:np.float64(0.1492295988320952) - perf/max_memory_allocated_gb:np.float64(72.29845476150513) - perf/max_memory_reserved_gb:np.float64(77.046875) - perf/cpu_memory_used_gb:np.float64(821.7985572814941) - actor/lr:np.float64(1e-06) - training/global_step:23 - training/epoch:3 - critic/score/mean:-4.69627046585083 - critic/score/max:-3.9997334480285645 - critic/score/min:-6.658992767333984 - critic/rewards/mean:-4.69627046585083 - critic/rewards/max:-3.9997334480285645 - critic/rewards/min:-6.658992767333984 - critic/advantages/mean:0.00020102741837035865 - critic/advantages/max:1.154699444770813 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:0.00020102741837035865 - critic/returns/max:1.154699444770813 - critic/returns/min:-1.1546999216079712 - response_length/mean:365.2395935058594 - response_length/max:555.0 - response_length/min:195.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:365.2395935058594 - response_length_non_aborted/max:555.0 - response_length_non_aborted/min:195.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00011467374861240387 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(17.33453810028732) - 
timing_s/agent_loop/generate_sequences/max:np.float64(36.433067444711924) - timing_s/agent_loop/generate_sequences/mean:np.float64(31.183637167867953) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(36.433067444711924) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:548 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:545.9259275514632 - timing_s/reward:0.000230494886636734 - timing_s/old_log_prob:66.405916813761 - timing_s/ref:71.94262049533427 - timing_s/adv:0.12862985767424107 - timing_s/update_actor:393.99213512428105 - timing_s/update_weights:60.267181005328894 - timing_s/step:1139.4639489818364 - timing_s/stop_profile:0.00021654926240444183 - timing_per_token_ms/ref:0.05357529497013347 - timing_per_token_ms/update_actor:0.29340389201648537 - timing_per_token_ms/adv:9.578998539969339e-05 - timing_per_token_ms/gen:0.973116118756708 - perf/total_num_tokens:1342832 - perf/time_per_step:1139.4639489818364 - perf/throughput:589.2384753374085 +step:24 - global_seqlen/min:674282 - global_seqlen/max:674374 - global_seqlen/minmax_diff:92 - global_seqlen/balanced_min:674328 - global_seqlen/balanced_max:674328 - global_seqlen/mean:674328.0 - actor/entropy:0.5009976625442505 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.0006139262548761802) - actor/kl_loss:np.float64(0.08599221628780167) - actor/pg_clipfrac:np.float64(0.0009783723750539746) - actor/ppo_kl:np.float64(0.0003668959103985496) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.3083786517381668) - perf/mfu/actor:np.float64(0.2055716607197697) - perf/max_memory_allocated_gb:np.float64(72.3738784790039) - perf/max_memory_reserved_gb:np.float64(77.046875) - 
perf/cpu_memory_used_gb:np.float64(821.8049354553223) - actor/lr:np.float64(1e-06) - training/global_step:24 - training/epoch:3 - critic/score/mean:-4.683456897735596 - critic/score/max:0.0 - critic/score/min:-6.659821033477783 - critic/rewards/mean:-4.683456897735596 - critic/rewards/max:0.0 - critic/rewards/min:-6.659821033477783 - critic/advantages/mean:-0.001468533300794661 - critic/advantages/max:1.154699683189392 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:-0.001468533300794661 - critic/returns/max:1.154699683189392 - critic/returns/min:-1.1546999216079712 - response_length/mean:369.03125 - response_length/max:582.0 - response_length/min:217.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:369.03125 - response_length_non_aborted/max:582.0 - response_length_non_aborted/min:217.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00012300163507461548 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(20.903207078576088) - timing_s/agent_loop/generate_sequences/max:np.float64(37.75168653763831) - timing_s/agent_loop/generate_sequences/mean:np.float64(32.98889480026022) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(37.75168653763831) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:582 - 
timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:522.0464805904776 - timing_s/reward:0.0004603452980518341 - timing_s/old_log_prob:57.33489348180592 - timing_s/ref:62.83766516856849 - timing_s/adv:0.07699690200388432 - timing_s/update_actor:287.53678811714053 - timing_s/update_weights:29.678540041670203 - timing_s/step:960.4047084394842 - timing_s/stop_profile:0.00020066089928150177 - timing_per_token_ms/ref:0.04659280436862216 - timing_per_token_ms/update_actor:0.21320246832190012 - timing_per_token_ms/adv:5.7091580064808455e-05 - timing_per_token_ms/gen:0.9209897828465534 - perf/total_num_tokens:1348656 - perf/time_per_step:960.4047084394842 - perf/throughput:702.1290025698472 +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 25} +validation generation end +step:25 - global_seqlen/min:674878 - global_seqlen/max:675495 - global_seqlen/minmax_diff:617 - global_seqlen/balanced_min:675186 - global_seqlen/balanced_max:675187 - global_seqlen/mean:675186.5 - actor/entropy:0.49804309010505676 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.0007791539634733162) - actor/kl_loss:np.float64(0.09398268085593979) - actor/pg_clipfrac:np.float64(0.0009164255922466206) - actor/ppo_kl:np.float64(0.00020576581944453665) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.32828205823898315) - perf/mfu/actor:np.float64(0.20642003293129402) - perf/max_memory_allocated_gb:np.float64(72.46385955810547) - perf/max_memory_reserved_gb:np.float64(77.046875) - perf/cpu_memory_used_gb:np.float64(817.1845893859863) - actor/lr:np.float64(1e-06) - val-aux/multiclinsum/reward/mean@1:np.float64(-4.695695516642402) - val-core/multiclinsum/acc/mean@1:np.float64(-4.695695542727872) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) - 
training/global_step:25 - training/epoch:4 - critic/score/mean:-4.69353723526001 - critic/score/max:-4.132299900054932 - critic/score/min:-6.654054164886475 - critic/rewards/mean:-4.69353723526001 - critic/rewards/max:-4.132299900054932 - critic/rewards/min:-6.654054164886475 - critic/advantages/mean:0.0019021849147975445 - critic/advantages/max:1.1546906232833862 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0019021849147975445 - critic/returns/max:1.1546906232833862 - critic/returns/min:-1.1546998023986816 - response_length/mean:370.1490783691406 - response_length/max:647.0 - response_length/min:243.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:370.1490783691406 - response_length_non_aborted/max:647.0 - response_length_non_aborted/min:243.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.000406850129365921 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(23.148586886003613) - timing_s/agent_loop/generate_sequences/max:np.float64(39.79267919622362) - timing_s/agent_loop/generate_sequences/mean:np.float64(34.29482706182656) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(39.79267919622362) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:647 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:516.2545729614794 - 
timing_s/reward:0.0002513658255338669 - timing_s/old_log_prob:56.20504837855697 - timing_s/ref:65.84114354662597 - timing_s/adv:0.12007007002830505 - timing_s/update_actor:286.67125796340406 - timing_s/update_weights:31.883460465818644 - timing_s/step:957.8022147845477 - timing_s/testing:114.3283789344132 - timing_s/stop_profile:0.00018743425607681274 - timing_per_token_ms/ref:0.04875774585734902 - timing_per_token_ms/update_actor:0.21229042491474878 - timing_per_token_ms/adv:8.891622538980345e-05 - timing_per_token_ms/gen:0.9080212487604049 - perf/total_num_tokens:1350373 - perf/time_per_step:957.8022147845477 - perf/throughput:704.9331162299301 +step:26 - global_seqlen/min:679784 - global_seqlen/max:681852 - global_seqlen/minmax_diff:2068 - global_seqlen/balanced_min:680818 - global_seqlen/balanced_max:680818 - global_seqlen/mean:680818.0 - actor/entropy:0.498159795999527 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.00021224676796312292) - actor/kl_loss:np.float64(0.1013380087291201) - actor/pg_clipfrac:np.float64(0.0011136207382757373) - actor/ppo_kl:np.float64(0.0001295087629387126) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.36717434227466583) - perf/mfu/actor:np.float64(0.20586205738357236) - perf/max_memory_allocated_gb:np.float64(72.95346212387085) - perf/max_memory_reserved_gb:np.float64(77.046875) - perf/cpu_memory_used_gb:np.float64(817.2504806518555) - actor/lr:np.float64(1e-06) - training/global_step:26 - training/epoch:4 - critic/score/mean:-4.692097187042236 - critic/score/max:-4.166399955749512 - critic/score/min:-6.658269882202148 - critic/rewards/mean:-4.692097187042236 - critic/rewards/max:-4.166399955749512 - critic/rewards/min:-6.658269882202148 - critic/advantages/mean:0.0010039490880444646 - critic/advantages/max:1.1546999216079712 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0010039490880444646 - critic/returns/max:1.1546999216079712 - 
critic/returns/min:-1.1546998023986816 - response_length/mean:377.4817810058594 - response_length/max:585.0 - response_length/min:236.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:377.4817810058594 - response_length_non_aborted/max:585.0 - response_length_non_aborted/min:236.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:6.62393867969513e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(20.17172661423683) - timing_s/agent_loop/generate_sequences/max:np.float64(38.385181967169046) - timing_s/agent_loop/generate_sequences/mean:np.float64(32.60024401163779) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(38.385181967169046) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:585 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:822.7202929798514 - timing_s/reward:0.0002047419548034668 - timing_s/old_log_prob:54.89943413250148 - timing_s/ref:61.82553852535784 - timing_s/adv:0.12437914684414864 - timing_s/update_actor:289.9532284773886 - timing_s/update_weights:32.06444246135652 - timing_s/step:1262.3887617774308 - timing_s/stop_profile:0.0003065243363380432 - timing_per_token_ms/ref:0.04540533485113337 - timing_per_token_ms/update_actor:0.21294474329217838 - timing_per_token_ms/adv:9.134537192329568e-05 - 
timing_per_token_ms/gen:1.4189431970705184 - perf/total_num_tokens:1361636 - perf/time_per_step:1262.3887617774308 - perf/throughput:539.3093004419773 +step:27 - global_seqlen/min:682301 - global_seqlen/max:683482 - global_seqlen/minmax_diff:1181 - global_seqlen/balanced_min:682891 - global_seqlen/balanced_max:682892 - global_seqlen/mean:682891.5 - actor/entropy:0.5033957958221436 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0003889291207694167) - actor/kl_loss:np.float64(0.11449029017239809) - actor/pg_clipfrac:np.float64(0.0008980958276273062) - actor/ppo_kl:np.float64(0.00032077366230017407) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.32441818714141846) - perf/mfu/actor:np.float64(0.22434641194851285) - perf/max_memory_allocated_gb:np.float64(72.9587550163269) - perf/max_memory_reserved_gb:np.float64(77.046875) - perf/cpu_memory_used_gb:np.float64(810.7514266967773) - actor/lr:np.float64(1e-06) - training/global_step:27 - training/epoch:4 - critic/score/mean:-4.692539691925049 - critic/score/max:-4.270566463470459 - critic/score/min:-6.651907444000244 - critic/rewards/mean:-4.692539691925049 - critic/rewards/max:-4.270566463470459 - critic/rewards/min:-6.651907444000244 - critic/advantages/mean:0.0003721058601513505 - critic/advantages/max:1.1546999216079712 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0003721058601513505 - critic/returns/max:1.1546999216079712 - critic/returns/min:-1.1546998023986816 - response_length/mean:380.181640625 - response_length/max:570.0 - response_length/min:245.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:380.181640625 - response_length_non_aborted/max:570.0 - response_length_non_aborted/min:245.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - 
num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00015452317893505096 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(23.703151009976864) - timing_s/agent_loop/generate_sequences/max:np.float64(40.115123523399234) - timing_s/agent_loop/generate_sequences/mean:np.float64(34.86552871786747) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(40.115123523399234) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:570 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:517.4485868103802 - timing_s/reward:0.0002330206334590912 - timing_s/old_log_prob:54.699649246409535 - timing_s/ref:59.30744280107319 - timing_s/adv:0.10648751445114613 - timing_s/update_actor:267.0670532807708 - timing_s/update_weights:23.746527275070548 - timing_s/step:923.2664021309465 - timing_s/stop_profile:0.00016267411410808563 - timing_per_token_ms/ref:0.04342376702673353 - timing_per_token_ms/update_actor:0.1955413512108225 - timing_per_token_ms/adv:7.796810653752912e-05 - timing_per_token_ms/gen:0.8861043100806396 - perf/total_num_tokens:1365783 - perf/time_per_step:923.2664021309465 - perf/throughput:739.6472983570627 +step:28 - global_seqlen/min:681268 - global_seqlen/max:682333 - global_seqlen/minmax_diff:1065 - global_seqlen/balanced_min:681800 - global_seqlen/balanced_max:681801 - global_seqlen/mean:681800.5 - actor/entropy:0.5010993480682373 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0007103343694628457) - actor/kl_loss:np.float64(0.11914697693039972) - 
actor/pg_clipfrac:np.float64(0.0009225395442626905) - actor/ppo_kl:np.float64(0.00012638173105491055) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.2715914845466614) - perf/mfu/actor:np.float64(0.22225088207238486) - perf/max_memory_allocated_gb:np.float64(72.9587550163269) - perf/max_memory_reserved_gb:np.float64(77.046875) - perf/cpu_memory_used_gb:np.float64(817.0567970275879) - actor/lr:np.float64(1e-06) - training/global_step:28 - training/epoch:4 - critic/score/mean:-4.691234111785889 - critic/score/max:-3.9805026054382324 - critic/score/min:-6.656299114227295 - critic/rewards/mean:-4.691234111785889 - critic/rewards/max:-3.9805026054382324 - critic/rewards/min:-6.656299114227295 - critic/advantages/mean:0.002464129589498043 - critic/advantages/max:1.1546995639801025 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.002464129589498043 - critic/returns/max:1.1546995639801025 - critic/returns/min:-1.1546998023986816 - response_length/mean:378.7610778808594 - response_length/max:603.0 - response_length/min:247.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:378.7610778808594 - response_length_non_aborted/max:603.0 - response_length_non_aborted/min:247.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00011247768998146057 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(22.06622837483883) - timing_s/agent_loop/generate_sequences/max:np.float64(37.56256848201156) - timing_s/agent_loop/generate_sequences/mean:np.float64(32.53362541972456) - 
timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(37.56256848201156) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:603 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:890.8751140478998 - timing_s/reward:0.000180734321475029 - timing_s/old_log_prob:53.0816009119153 - timing_s/ref:59.53865665756166 - timing_s/adv:0.11059157364070415 - timing_s/update_actor:269.10024466365576 - timing_s/update_weights:22.37174284644425 - timing_s/step:1295.9223125353456 - timing_s/stop_profile:0.00016786344349384308 - timing_per_token_ms/ref:0.043662813871184945 - timing_per_token_ms/update_actor:0.19734529724138936 - timing_per_token_ms/adv:8.110259059703253e-05 - timing_per_token_ms/gen:1.5312999895972166 - perf/total_num_tokens:1363601 - perf/time_per_step:1295.9223125353456 - perf/throughput:526.1121700004716 +step:29 - global_seqlen/min:682636 - global_seqlen/max:685557 - global_seqlen/minmax_diff:2921 - global_seqlen/balanced_min:684096 - global_seqlen/balanced_max:684097 - global_seqlen/mean:684096.5 - actor/entropy:0.5008987188339233 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0002964157877916769) - actor/kl_loss:np.float64(0.1260907830049594) - actor/pg_clipfrac:np.float64(0.0007629560689868716) - actor/ppo_kl:np.float64(6.0955454993442494e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.2763803079724312) - perf/mfu/actor:np.float64(0.21696441484665985) - perf/max_memory_allocated_gb:np.float64(72.9587550163269) - perf/max_memory_reserved_gb:np.float64(77.046875) - perf/cpu_memory_used_gb:np.float64(821.4271793365479) - actor/lr:np.float64(1e-06) - training/global_step:29 - training/epoch:4 - 
critic/score/mean:-4.699860572814941 - critic/score/max:-3.870945453643799 - critic/score/min:-6.6551642417907715 - critic/rewards/mean:-4.699860572814941 - critic/rewards/max:-3.870945453643799 - critic/rewards/min:-6.6551642417907715 - critic/advantages/mean:0.0017186601180583239 - critic/advantages/max:1.1546993255615234 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0017186601180583239 - critic/returns/max:1.1546993255615234 - critic/returns/min:-1.1546998023986816 - response_length/mean:381.7506408691406 - response_length/max:564.0 - response_length/min:261.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:381.7506408691406 - response_length_non_aborted/max:564.0 - response_length_non_aborted/min:261.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:6.42184168100357e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(23.975525584071875) - timing_s/agent_loop/generate_sequences/max:np.float64(38.89435680024326) - timing_s/agent_loop/generate_sequences/mean:np.float64(33.95623599763712) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(38.89435680024326) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:564 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:540.644355500117 - timing_s/reward:0.00021813064813613892 - 
timing_s/old_log_prob:54.063957484439015 - timing_s/ref:62.265074729919434 - timing_s/adv:0.10691413655877113 - timing_s/update_actor:276.55859392136335 - timing_s/update_weights:20.722042279317975 - timing_s/step:955.1321364752948 - timing_s/stop_profile:0.0003051348030567169 - timing_per_token_ms/ref:0.04550898501155863 - timing_per_token_ms/update_actor:0.202134197383968 - timing_per_token_ms/adv:7.814258409359727e-05 - timing_per_token_ms/gen:0.9220206994232589 - perf/total_num_tokens:1368193 - perf/time_per_step:955.1321364752948 - perf/throughput:716.2323137032199 +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 30} +validation generation end +step:30 - global_seqlen/min:679465 - global_seqlen/max:680085 - global_seqlen/minmax_diff:620 - global_seqlen/balanced_min:679775 - global_seqlen/balanced_max:679775 - global_seqlen/mean:679775.0 - actor/entropy:0.4965134859085083 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.0002428666666673961) - actor/kl_loss:np.float64(0.13018149556592107) - actor/pg_clipfrac:np.float64(0.0012096409506436128) - actor/ppo_kl:np.float64(0.00015287075164375588) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.4259598106145859) - perf/mfu/actor:np.float64(0.21876005983769814) - perf/max_memory_allocated_gb:np.float64(72.9587550163269) - perf/max_memory_reserved_gb:np.float64(77.046875) - perf/cpu_memory_used_gb:np.float64(813.9923877716064) - actor/lr:np.float64(1e-06) - val-aux/multiclinsum/reward/mean@1:np.float64(-4.744195309807273) - val-core/multiclinsum/acc/mean@1:np.float64(-4.744195344970511) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) - training/global_step:30 - training/epoch:4 - critic/score/mean:-4.687333583831787 - critic/score/max:0.0 - critic/score/min:-6.658203125 - 
critic/rewards/mean:-4.687333583831787 - critic/rewards/max:0.0 - critic/rewards/min:-6.658203125 - critic/advantages/mean:0.0036624385975301266 - critic/advantages/max:1.154699683189392 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0036624385975301266 - critic/returns/max:1.154699683189392 - critic/returns/min:-1.1546998023986816 - response_length/mean:376.1236877441406 - response_length/max:566.0 - response_length/min:236.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:376.1236877441406 - response_length_non_aborted/max:566.0 - response_length_non_aborted/min:236.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:5.9740617871284485e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(22.066808216273785) - timing_s/agent_loop/generate_sequences/max:np.float64(37.66966500133276) - timing_s/agent_loop/generate_sequences/mean:np.float64(32.9696852546161) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(37.66966500133276) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:566 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:540.0424392707646 - timing_s/reward:0.00020431913435459137 - timing_s/old_log_prob:53.254052048549056 - timing_s/ref:63.11544557660818 - timing_s/adv:0.1119823008775711 - 
timing_s/update_actor:272.481114840135 - timing_s/update_weights:22.846587939187884 - timing_s/step:952.6932214610279 - timing_s/testing:117.24726248905063 - timing_s/stop_profile:0.00021729432046413422 - timing_per_token_ms/ref:0.04642377667361126 - timing_per_token_ms/update_actor:0.20042007637831266 - timing_per_token_ms/adv:8.236718096250311e-05 - timing_per_token_ms/gen:0.934772607206123 - perf/total_num_tokens:1359550 - perf/time_per_step:952.6932214610279 - perf/throughput:713.5297960423326 +step:31 - global_seqlen/min:686457 - global_seqlen/max:686486 - global_seqlen/minmax_diff:29 - global_seqlen/balanced_min:686471 - global_seqlen/balanced_max:686472 - global_seqlen/mean:686471.5 - actor/entropy:0.4967544674873352 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.00044098349705260984) - actor/kl_loss:np.float64(0.14590224375327426) - actor/pg_clipfrac:np.float64(0.001148006090564498) - actor/ppo_kl:np.float64(0.0005636221812276668) - actor/pg_clipfrac_lower:np.float64(2.9804599156098752e-06) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.35505199432373047) - perf/mfu/actor:np.float64(0.2024766197312401) - perf/max_memory_allocated_gb:np.float64(73.02624225616455) - perf/max_memory_reserved_gb:np.float64(77.046875) - perf/cpu_memory_used_gb:np.float64(814.8349494934082) - actor/lr:np.float64(1e-06) - training/global_step:31 - training/epoch:5 - critic/score/mean:-4.704183101654053 - critic/score/max:0.0 - critic/score/min:-6.658992767333984 - critic/rewards/mean:-4.704183101654053 - critic/rewards/max:0.0 - critic/rewards/min:-6.658992767333984 - critic/advantages/mean:0.0050537786446511745 - critic/advantages/max:1.1546995639801025 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0050537786446511745 - critic/returns/max:1.1546995639801025 - critic/returns/min:-1.1546998023986816 - response_length/mean:384.8431091308594 - response_length/max:592.0 - response_length/min:241.0 - response_length/clip_ratio:0.0 - 
response_length_non_aborted/mean:384.8431091308594 - response_length_non_aborted/max:592.0 - response_length_non_aborted/min:241.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.0003611557185649872 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(21.165874291211367) - timing_s/agent_loop/generate_sequences/max:np.float64(37.86546911671758) - timing_s/agent_loop/generate_sequences/mean:np.float64(33.06962426786534) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(37.86546911671758) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:592 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:665.7796415518969 - timing_s/reward:0.00027798302471637726 - timing_s/old_log_prob:52.71237541735172 - timing_s/ref:64.08532956428826 - timing_s/adv:0.12198115140199661 - timing_s/update_actor:297.32630097307265 - timing_s/update_weights:30.112091097980738 - timing_s/step:1110.8531722482294 - timing_s/stop_profile:0.00021000578999519348 - timing_per_token_ms/ref:0.046677341713595 - timing_per_token_ms/update_actor:0.21656128548167888 - timing_per_token_ms/adv:8.884647898856442e-05 - timing_per_token_ms/gen:1.1263039109754498 - perf/total_num_tokens:1372943 - perf/time_per_step:1110.8531722482294 - perf/throughput:617.9678081223522 +step:32 - global_seqlen/min:690020 - 
global_seqlen/max:691410 - global_seqlen/minmax_diff:1390 - global_seqlen/balanced_min:690715 - global_seqlen/balanced_max:690715 - global_seqlen/mean:690715.0 - actor/entropy:0.49535977840423584 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0003213261031002543) - actor/kl_loss:np.float64(0.1570459992314378) - actor/pg_clipfrac:np.float64(0.0010873199765531656) - actor/ppo_kl:np.float64(0.0001670999669916758) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.35716547071933746) - perf/mfu/actor:np.float64(0.21673872957012513) - perf/max_memory_allocated_gb:np.float64(73.38219451904297) - perf/max_memory_reserved_gb:np.float64(77.046875) - perf/cpu_memory_used_gb:np.float64(814.5925140380859) - actor/lr:np.float64(1e-06) - training/global_step:32 - training/epoch:5 - critic/score/mean:-4.672434329986572 - critic/score/max:-3.6663999557495117 - critic/score/min:-6.656299114227295 - critic/rewards/mean:-4.672434329986572 - critic/rewards/max:-3.6663999557495117 - critic/rewards/min:-6.656299114227295 - critic/advantages/mean:0.0025693068746477365 - critic/advantages/max:1.1546980142593384 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0025693068746477365 - critic/returns/max:1.1546980142593384 - critic/returns/min:-1.1546998023986816 - response_length/mean:390.3684997558594 - response_length/max:585.0 - response_length/min:258.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:390.3684997558594 - response_length_non_aborted/max:585.0 - response_length_non_aborted/min:258.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:9.639374911785126e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - 
timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(22.391188690438867) - timing_s/agent_loop/generate_sequences/max:np.float64(39.05870996415615) - timing_s/agent_loop/generate_sequences/mean:np.float64(33.77961843233061) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(39.05870996415615) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:585 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:529.3134188149124 - timing_s/reward:0.00017348304390907288 - timing_s/old_log_prob:53.55814334191382 - timing_s/ref:59.73220798932016 - timing_s/adv:0.10601087100803852 - timing_s/update_actor:279.60596734285355 - timing_s/update_weights:21.066588884219527 - timing_s/step:944.2281676437706 - timing_s/stop_profile:0.0001968834549188614 - timing_per_token_ms/ref:0.04323940264024971 - timing_per_token_ms/update_actor:0.20240328307829825 - timing_per_token_ms/adv:7.673995136057456e-05 - timing_per_token_ms/gen:0.8827687161484582 - perf/total_num_tokens:1381430 - perf/time_per_step:944.2281676437706 - perf/throughput:731.512809794281 +step:33 - global_seqlen/min:694631 - global_seqlen/max:696030 - global_seqlen/minmax_diff:1399 - global_seqlen/balanced_min:695290 - global_seqlen/balanced_max:695371 - global_seqlen/mean:695330.5 - actor/entropy:0.49182015657424927 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.00022034942715739528) - actor/kl_loss:np.float64(0.1666806957994898) - actor/pg_clipfrac:np.float64(0.0010168382820362847) - actor/ppo_kl:np.float64(9.323938142339709e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - 
actor/grad_norm:np.float64(0.31018415093421936) - perf/mfu/actor:np.float64(0.1952765083659439) - perf/max_memory_allocated_gb:np.float64(73.73814964294434) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(813.323881149292) - actor/lr:np.float64(1e-06) - training/global_step:33 - training/epoch:5 - critic/score/mean:-4.714134216308594 - critic/score/max:-4.238768577575684 - critic/score/min:-6.654905796051025 - critic/rewards/mean:-4.714134216308594 - critic/rewards/max:-4.238768577575684 - critic/rewards/min:-6.654905796051025 - critic/advantages/mean:0.0009494231780990958 - critic/advantages/max:1.154699444770813 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:0.0009494231780990958 - critic/returns/max:1.154699444770813 - critic/returns/min:-1.1546999216079712 - response_length/mean:396.3782653808594 - response_length/max:838.0 - response_length/min:249.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:396.3782653808594 - response_length_non_aborted/max:838.0 - response_length_non_aborted/min:249.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00011813454329967499 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(22.332424879074097) - timing_s/agent_loop/generate_sequences/max:np.float64(41.82329605333507) - timing_s/agent_loop/generate_sequences/mean:np.float64(34.440764077055064) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - 
timing_s/agent_loop/slowest/generate_sequences:np.float64(41.82329605333507) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:838 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:661.5183580927551 - timing_s/reward:0.00022965297102928162 - timing_s/old_log_prob:55.00175532884896 - timing_s/ref:66.16903232596815 - timing_s/adv:0.0966866947710514 - timing_s/update_actor:312.2728597521782 - timing_s/update_weights:18.88545323908329 - timing_s/step:1114.8555752243847 - timing_s/stop_profile:0.00019505247473716736 - timing_per_token_ms/ref:0.04758099373317304 - timing_per_token_ms/update_actor:0.22454995124777224 - timing_per_token_ms/adv:6.952571098999066e-05 - timing_per_token_ms/gen:1.0865278524346502 - perf/total_num_tokens:1390661 - perf/time_per_step:1114.8555752243847 - perf/throughput:623.6955848384687 +step:34 - global_seqlen/min:696172 - global_seqlen/max:697431 - global_seqlen/minmax_diff:1259 - global_seqlen/balanced_min:696801 - global_seqlen/balanced_max:696802 - global_seqlen/mean:696801.5 - actor/entropy:0.48958510160446167 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.00015819776611654922) - actor/kl_loss:np.float64(0.177057026575009) - actor/pg_clipfrac:np.float64(0.0008103765042809149) - actor/ppo_kl:np.float64(8.289799789433043e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.31421928107738495) - perf/mfu/actor:np.float64(0.2172208972121894) - perf/max_memory_allocated_gb:np.float64(73.73814964294434) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(806.7108840942383) - actor/lr:np.float64(1e-06) - training/global_step:34 - training/epoch:5 - critic/score/mean:-4.686532974243164 - critic/score/max:-3.9997334480285645 - critic/score/min:-6.654905796051025 - critic/rewards/mean:-4.686532974243164 - 
critic/rewards/max:-3.9997334480285645 - critic/rewards/min:-6.654905796051025 - critic/advantages/mean:0.0016516846371814609 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0016516846371814609 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546998023986816 - response_length/mean:398.2936096191406 - response_length/max:574.0 - response_length/min:250.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:398.2936096191406 - response_length_non_aborted/max:574.0 - response_length_non_aborted/min:250.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:7.946975529193878e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(23.989261034876108) - timing_s/agent_loop/generate_sequences/max:np.float64(42.70244946703315) - timing_s/agent_loop/generate_sequences/mean:np.float64(37.41907303235712) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(42.70244946703315) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:574 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:650.1796634513885 - timing_s/reward:0.0002342667430639267 - timing_s/old_log_prob:56.37677332572639 - timing_s/ref:65.2173971850425 - timing_s/adv:0.0898739118129015 - timing_s/update_actor:281.43595359288156 - 
timing_s/update_weights:29.194567143917084 - timing_s/step:1083.2815842013806 - timing_s/stop_profile:0.0002065952867269516 - timing_per_token_ms/ref:0.046797687135462895 - timing_per_token_ms/update_actor:0.20194844126546913 - timing_per_token_ms/adv:6.449032602032394e-05 - timing_per_token_ms/gen:1.062768848638787 - perf/total_num_tokens:1393603 - perf/time_per_step:1083.2815842013806 - perf/throughput:643.2321108031183 +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 35} +validation generation end +step:35 - global_seqlen/min:702636 - global_seqlen/max:703024 - global_seqlen/minmax_diff:388 - global_seqlen/balanced_min:702830 - global_seqlen/balanced_max:702830 - global_seqlen/mean:702830.0 - actor/entropy:0.48366057872772217 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.00021059673114602606) - actor/kl_loss:np.float64(0.18742932689686614) - actor/pg_clipfrac:np.float64(0.001221013527053098) - actor/ppo_kl:np.float64(0.00010008383291430316) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.36773815751075745) - perf/mfu/actor:np.float64(0.1705859748999483) - perf/max_memory_allocated_gb:np.float64(73.73814964294434) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(800.1265335083008) - actor/lr:np.float64(1e-06) - val-aux/multiclinsum/reward/mean@1:np.float64(-4.662470179445603) - val-core/multiclinsum/acc/mean@1:np.float64(-4.662470205593103) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) - training/global_step:35 - training/epoch:5 - critic/score/mean:-4.703644275665283 - critic/score/max:-4.257299900054932 - critic/score/min:-6.657225608825684 - critic/rewards/mean:-4.703644275665283 - critic/rewards/max:-4.257299900054932 - critic/rewards/min:-6.657225608825684 - 
critic/advantages/mean:0.0007636038935743272 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0007636038935743272 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546998023986816 - response_length/mean:406.1432189941406 - response_length/max:565.0 - response_length/min:261.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:406.1432189941406 - response_length_non_aborted/max:565.0 - response_length_non_aborted/min:261.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:9.986758232116699e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(22.428727066144347) - timing_s/agent_loop/generate_sequences/max:np.float64(41.39781468734145) - timing_s/agent_loop/generate_sequences/mean:np.float64(36.108661144857855) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(41.39781468734145) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:559 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:807.3028340749443 - timing_s/reward:0.00022482313215732574 - timing_s/old_log_prob:55.01183724589646 - timing_s/ref:62.62568815052509 - timing_s/adv:0.10645129904150963 - timing_s/update_actor:361.27541264146566 - timing_s/update_weights:27.657940370962024 - timing_s/step:1314.7662211824208 - 
timing_s/testing:123.73463488928974 - timing_s/stop_profile:0.00016891397535800934 - timing_per_token_ms/ref:0.044552514939974884 - timing_per_token_ms/update_actor:0.2570147920844768 - timing_per_token_ms/adv:7.573047468200676e-05 - timing_per_token_ms/gen:1.2940946564080051 - perf/total_num_tokens:1405660 - perf/time_per_step:1314.7662211824208 - perf/throughput:534.5665173599588 +step:36 - global_seqlen/min:698005 - global_seqlen/max:699391 - global_seqlen/minmax_diff:1386 - global_seqlen/balanced_min:698698 - global_seqlen/balanced_max:698698 - global_seqlen/mean:698698.0 - actor/entropy:0.4787822961807251 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(3.082866896874856e-05) - actor/kl_loss:np.float64(0.19404008953521648) - actor/pg_clipfrac:np.float64(0.0012146906010457315) - actor/ppo_kl:np.float64(0.0001629093295605344) - actor/pg_clipfrac_lower:np.float64(1.92384641195531e-06) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.361365869641304) - perf/mfu/actor:np.float64(0.17381117911875815) - perf/max_memory_allocated_gb:np.float64(73.96442461013794) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(800.1292762756348) - actor/lr:np.float64(1e-06) - training/global_step:36 - training/epoch:5 - critic/score/mean:-4.694204807281494 - critic/score/max:-4.211854457855225 - critic/score/min:-6.658824443817139 - critic/rewards/mean:-4.694204807281494 - critic/rewards/max:-4.211854457855225 - critic/rewards/min:-6.658824443817139 - critic/advantages/mean:0.0013898154720664024 - critic/advantages/max:1.154699444770813 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:0.0013898154720664024 - critic/returns/max:1.154699444770813 - critic/returns/min:-1.1546999216079712 - response_length/mean:400.7630310058594 - response_length/max:632.0 - response_length/min:257.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:400.7630310058594 - response_length_non_aborted/max:632.0 - 
response_length_non_aborted/min:257.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:5.457736551761627e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(22.120731122791767) - timing_s/agent_loop/generate_sequences/max:np.float64(40.12327888794243) - timing_s/agent_loop/generate_sequences/mean:np.float64(34.781033763427935) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(40.12327888794243) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:616 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:542.8108114264905 - timing_s/reward:0.00032837316393852234 - timing_s/old_log_prob:55.12414244748652 - timing_s/ref:66.57894644886255 - timing_s/adv:0.11767221800982952 - timing_s/update_actor:352.3681481182575 - timing_s/update_weights:28.810284819453955 - timing_s/step:1046.6477100998163 - timing_s/stop_profile:0.00021147355437278748 - timing_per_token_ms/ref:0.047645010039289185 - timing_per_token_ms/update_actor:0.25216055299876167 - timing_per_token_ms/adv:8.42082115662486e-05 - timing_per_token_ms/gen:0.8817990607540476 - perf/total_num_tokens:1397396 - perf/time_per_step:1046.6477100998163 - perf/throughput:667.5579502613796 +step:37 - global_seqlen/min:693319 - global_seqlen/max:694834 - global_seqlen/minmax_diff:1515 - global_seqlen/balanced_min:694075 - 
global_seqlen/balanced_max:694078 - global_seqlen/mean:694076.5 - actor/entropy:0.4770285487174988 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.001959755412826782) - actor/kl_loss:np.float64(0.20200522243976593) - actor/pg_clipfrac:np.float64(0.0009152766118252961) - actor/ppo_kl:np.float64(-0.00011271919405923352) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.29167333245277405) - perf/mfu/actor:np.float64(0.18156486295184043) - perf/max_memory_allocated_gb:np.float64(73.96442461013794) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(805.8497200012207) - actor/lr:np.float64(1e-06) - training/global_step:37 - training/epoch:6 - critic/score/mean:-4.697465419769287 - critic/score/max:-4.30276346206665 - critic/score/min:-6.6555304527282715 - critic/rewards/mean:-4.697465419769287 - critic/rewards/max:-4.30276346206665 - critic/rewards/min:-6.6555304527282715 - critic/advantages/mean:0.00278259115293622 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:0.00278259115293622 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546999216079712 - response_length/mean:394.7454528808594 - response_length/max:726.0 - response_length/min:253.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:394.7454528808594 - response_length_non_aborted/max:726.0 - response_length_non_aborted/min:253.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.0004978906363248825 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - 
timing_s/agent_loop/generate_sequences/min:np.float64(24.105433033779263) - timing_s/agent_loop/generate_sequences/max:np.float64(41.63586726784706) - timing_s/agent_loop/generate_sequences/mean:np.float64(35.66366991703762) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(41.63586726784706) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:726 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:517.3663522675633 - timing_s/reward:0.0001926850527524948 - timing_s/old_log_prob:54.81703631952405 - timing_s/ref:65.71737437136471 - timing_s/adv:0.12063013948500156 - timing_s/update_actor:335.3142611067742 - timing_s/update_weights:39.646684465929866 - timing_s/step:1013.8227445445955 - timing_s/stop_profile:0.0002222880721092224 - timing_per_token_ms/ref:0.047341593017026735 - timing_per_token_ms/update_actor:0.2415542531023412 - timing_per_token_ms/adv:8.689974338923848e-05 - timing_per_token_ms/gen:0.8532766076957614 - perf/total_num_tokens:1388153 - perf/time_per_step:1013.8227445445955 - perf/throughput:684.6132657161642 +step:38 - global_seqlen/min:687734 - global_seqlen/max:689095 - global_seqlen/minmax_diff:1361 - global_seqlen/balanced_min:688414 - global_seqlen/balanced_max:688415 - global_seqlen/mean:688414.5 - actor/entropy:0.47105807065963745 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.0002527547379334791) - actor/kl_loss:np.float64(0.20316848469277224) - actor/pg_clipfrac:np.float64(0.0008808161607400203) - actor/ppo_kl:np.float64(7.119112115105963e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.288836732506752) - perf/mfu/actor:np.float64(0.17870267527956274) - 
perf/max_memory_allocated_gb:np.float64(73.96442461013794) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(806.1152515411377) - actor/lr:np.float64(1e-06) - training/global_step:38 - training/epoch:6 - critic/score/mean:-4.681267738342285 - critic/score/max:0.0 - critic/score/min:-6.657309055328369 - critic/rewards/mean:-4.681267738342285 - critic/rewards/max:0.0 - critic/rewards/min:-6.657309055328369 - critic/advantages/mean:0.004398306366056204 - critic/advantages/max:1.154699683189392 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:0.004398306366056204 - critic/returns/max:1.154699683189392 - critic/returns/min:-1.1546999216079712 - response_length/mean:387.373046875 - response_length/max:623.0 - response_length/min:257.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:387.373046875 - response_length_non_aborted/max:623.0 - response_length_non_aborted/min:257.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:7.58003443479538e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(23.30347441136837) - timing_s/agent_loop/generate_sequences/max:np.float64(39.07290055602789) - timing_s/agent_loop/generate_sequences/mean:np.float64(33.86962571870148) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(39.07290055602789) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - 
timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:614 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:536.3558603450656 - timing_s/reward:0.00017476454377174377 - timing_s/old_log_prob:58.37003836221993 - timing_s/ref:66.97567273676395 - timing_s/adv:0.11183743923902512 - timing_s/update_actor:337.8109574858099 - timing_s/update_weights:32.53661219775677 - timing_s/step:1032.9877207539976 - timing_s/stop_profile:0.00019940175116062164 - timing_per_token_ms/ref:0.048644873645720677 - timing_per_token_ms/update_actor:0.24535433048389446 - timing_per_token_ms/adv:8.122827107725441e-05 - timing_per_token_ms/gen:0.9014308456988859 - perf/total_num_tokens:1376829 - perf/time_per_step:1032.9877207539976 - perf/throughput:666.4304775060763 +step:39 - global_seqlen/min:685784 - global_seqlen/max:686887 - global_seqlen/minmax_diff:1103 - global_seqlen/balanced_min:686335 - global_seqlen/balanced_max:686336 - global_seqlen/mean:686335.5 - actor/entropy:0.4654565751552582 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-9.688669039557406e-05) - actor/kl_loss:np.float64(0.21460302112003168) - actor/pg_clipfrac:np.float64(0.0011479221066110767) - actor/ppo_kl:np.float64(0.0001054730849621895) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.3527892529964447) - perf/mfu/actor:np.float64(0.1847871783779826) - perf/max_memory_allocated_gb:np.float64(73.96442461013794) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(805.6065979003906) - actor/lr:np.float64(1e-06) - training/global_step:39 - training/epoch:6 - critic/score/mean:-4.687253475189209 - critic/score/max:0.0 - critic/score/min:-6.657628059387207 - critic/rewards/mean:-4.687253475189209 - critic/rewards/max:0.0 - critic/rewards/min:-6.657628059387207 - critic/advantages/mean:0.004890956450253725 - critic/advantages/max:1.1546999216079712 - 
critic/advantages/min:-1.1546999216079712 - critic/returns/mean:0.004890956450253725 - critic/returns/max:1.1546999216079712 - critic/returns/min:-1.1546999216079712 - response_length/mean:384.666015625 - response_length/max:596.0 - response_length/min:249.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:384.666015625 - response_length_non_aborted/max:596.0 - response_length_non_aborted/min:249.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:9.86102968454361e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(24.65323754027486) - timing_s/agent_loop/generate_sequences/max:np.float64(40.717685882002115) - timing_s/agent_loop/generate_sequences/mean:np.float64(35.668440115904865) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(40.717685882002115) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:596 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:660.2091679424047 - timing_s/reward:0.00029441528022289276 - timing_s/old_log_prob:55.86141720041633 - timing_s/ref:65.52164687588811 - timing_s/adv:0.12104113027453423 - timing_s/update_actor:325.84477931819856 - timing_s/update_weights:31.7697462644428 - timing_s/step:1140.14723235555 - timing_s/stop_profile:0.00025328435003757477 - timing_per_token_ms/ref:0.04773295777057147 - 
timing_per_token_ms/update_actor:0.23738010005179577 - timing_per_token_ms/adv:8.817927258209303e-05 - timing_per_token_ms/gen:1.1173944658133235 - perf/total_num_tokens:1372671 - perf/time_per_step:1140.14723235555 - perf/throughput:601.9709389479702 +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 40} +validation generation end +local_global_step_folder: /home/mshahidul/readctrl/code/RL_model/RL_model/global_step_40 +Warning: remove_previous_ckpt_in_save is deprecated, set max_actor_ckpt_to_keep=1 and max_critic_ckpt_to_keep=1 instead +step:40 - global_seqlen/min:688447 - global_seqlen/max:689502 - global_seqlen/minmax_diff:1055 - global_seqlen/balanced_min:688974 - global_seqlen/balanced_max:688975 - global_seqlen/mean:688974.5 - actor/entropy:0.4550885856151581 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.00016928357945289463) - actor/kl_loss:np.float64(0.2236585508411129) - actor/pg_clipfrac:np.float64(0.001247902541460159) - actor/ppo_kl:np.float64(6.75129241625901e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.42527078092098236) - perf/mfu/actor:np.float64(0.22972546503566835) - perf/max_memory_allocated_gb:np.float64(73.96442461013794) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(797.5219078063965) - actor/lr:np.float64(1e-06) - val-aux/multiclinsum/reward/mean@1:np.float64(-4.716530487116645) - val-core/multiclinsum/acc/mean@1:np.float64(-4.7165305176488275) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) - training/global_step:40 - training/epoch:6 - critic/score/mean:-4.703845024108887 - critic/score/max:-4.166399955749512 - critic/score/min:-6.659821033477783 - critic/rewards/mean:-4.703845024108887 - critic/rewards/max:-4.166399955749512 - 
critic/rewards/min:-6.659821033477783 - critic/advantages/mean:0.000375720439478755 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.000375720439478755 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546998023986816 - response_length/mean:388.1022033691406 - response_length/max:590.0 - response_length/min:234.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:388.1022033691406 - response_length_non_aborted/max:590.0 - response_length_non_aborted/min:234.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:7.80150294303894e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(22.05880691483617) - timing_s/agent_loop/generate_sequences/max:np.float64(40.44416379183531) - timing_s/agent_loop/generate_sequences/mean:np.float64(34.97532783574934) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(40.44416379183531) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:590 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:564.1583431009203 - timing_s/reward:0.00017052702605724335 - timing_s/old_log_prob:53.23983366973698 - timing_s/ref:58.78972235508263 - timing_s/adv:0.10404612123966217 - timing_s/update_actor:262.9881455041468 - timing_s/update_weights:21.853313287720084 - 
timing_s/step:961.9591401461512 - timing_s/testing:134.2620944492519 - timing_s/save_checkpoint:151.70704928599298 - timing_s/stop_profile:0.0003153383731842041 - timing_per_token_ms/ref:0.04266465765792684 - timing_per_token_ms/update_actor:0.19085477438145157 - timing_per_token_ms/adv:7.550796236991512e-05 - timing_per_token_ms/gen:0.9463759162942676 - perf/total_num_tokens:1377949 - perf/time_per_step:961.9591401461512 - perf/throughput:716.2201295736154 +step:41 - global_seqlen/min:684520 - global_seqlen/max:685934 - global_seqlen/minmax_diff:1414 - global_seqlen/balanced_min:685227 - global_seqlen/balanced_max:685227 - global_seqlen/mean:685227.0 - actor/entropy:0.4535427987575531 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0003927030459938835) - actor/kl_loss:np.float64(0.2297459468245506) - actor/pg_clipfrac:np.float64(0.0013733001226986137) - actor/ppo_kl:np.float64(0.00030175010922069606) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.37812529504299164) - perf/mfu/actor:np.float64(0.24181308993127892) - perf/max_memory_allocated_gb:np.float64(73.96442461013794) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(813.3420925140381) - actor/lr:np.float64(1e-06) - training/global_step:41 - training/epoch:6 - critic/score/mean:-4.704458236694336 - critic/score/max:-3.9997334480285645 - critic/score/min:-6.6577043533325195 - critic/rewards/mean:-4.704458236694336 - critic/rewards/max:-3.9997334480285645 - critic/rewards/min:-6.6577043533325195 - critic/advantages/mean:0.004873715806752443 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.004873715806752443 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546998023986816 - response_length/mean:383.22265625 - response_length/max:601.0 - response_length/min:257.0 - response_length/clip_ratio:0.0 - 
response_length_non_aborted/mean:383.22265625 - response_length_non_aborted/max:601.0 - response_length_non_aborted/min:257.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:6.498023867607117e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(22.4523613024503) - timing_s/agent_loop/generate_sequences/max:np.float64(38.04885157942772) - timing_s/agent_loop/generate_sequences/mean:np.float64(32.89887848759705) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(38.04885157942772) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:578 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:549.3878342211246 - timing_s/reward:0.0002708360552787781 - timing_s/old_log_prob:52.85748570039868 - timing_s/ref:59.84916624613106 - timing_s/adv:0.11363416910171509 - timing_s/update_actor:248.6259426921606 - timing_s/update_weights:22.213924165815115 - timing_s/step:933.917120475322 - timing_s/stop_profile:0.00015847384929656982 - timing_per_token_ms/ref:0.04367105079494172 - timing_per_token_ms/update_actor:0.18141867052244046 - timing_per_token_ms/adv:8.291717131820191e-05 - timing_per_token_ms/gen:0.9333330516982224 - perf/total_num_tokens:1370454 - perf/time_per_step:933.917120475322 - perf/throughput:733.7128584292899 +step:42 - global_seqlen/min:686694 - global_seqlen/max:687179 
- global_seqlen/minmax_diff:485 - global_seqlen/balanced_min:686936 - global_seqlen/balanced_max:686937 - global_seqlen/mean:686936.5 - actor/entropy:0.44994065165519714 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.0005480100420148341) - actor/kl_loss:np.float64(0.23187385406345126) - actor/pg_clipfrac:np.float64(0.0010840108164605529) - actor/ppo_kl:np.float64(0.0001421749686490633) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.3514629453420639) - perf/mfu/actor:np.float64(0.2272878971421513) - perf/max_memory_allocated_gb:np.float64(73.96442461013794) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(794.9776344299316) - actor/lr:np.float64(1e-06) - training/global_step:42 - training/epoch:6 - critic/score/mean:-4.680936336517334 - critic/score/max:-3.9612717628479004 - critic/score/min:-6.658269882202148 - critic/rewards/mean:-4.680936336517334 - critic/rewards/max:-3.9612717628479004 - critic/rewards/min:-6.658269882202148 - critic/advantages/mean:0.002963840728625655 - critic/advantages/max:1.154691457748413 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.002963840728625655 - critic/returns/max:1.154691457748413 - critic/returns/min:-1.1546998023986816 - response_length/mean:385.4485778808594 - response_length/max:580.0 - response_length/min:256.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:385.4485778808594 - response_length_non_aborted/max:580.0 - response_length_non_aborted/min:256.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:9.151734411716461e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - 
timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(23.80479716323316) - timing_s/agent_loop/generate_sequences/max:np.float64(38.96061837673187) - timing_s/agent_loop/generate_sequences/mean:np.float64(33.88973106311338) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(38.96061837673187) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:580 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:572.6395308449864 - timing_s/reward:0.0003359392285346985 - timing_s/old_log_prob:53.482663767412305 - timing_s/ref:59.44303357973695 - timing_s/adv:0.11620891280472279 - timing_s/update_actor:265.07874103449285 - timing_s/update_weights:22.254323860630393 - timing_s/step:973.9199075419456 - timing_s/stop_profile:0.00018562190234661102 - timing_per_token_ms/ref:0.043266760158862536 - timing_per_token_ms/update_actor:0.19294268177225468 - timing_per_token_ms/adv:8.458490181022757e-05 - timing_per_token_ms/gen:0.9672164480389063 - perf/total_num_tokens:1373873 - perf/time_per_step:973.9199075419456 - perf/throughput:705.3316136988549 +step:43 - global_seqlen/min:685128 - global_seqlen/max:686220 - global_seqlen/minmax_diff:1092 - global_seqlen/balanced_min:685674 - global_seqlen/balanced_max:685674 - global_seqlen/mean:685674.0 - actor/entropy:0.4460827708244324 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.0010252818030570205) - actor/kl_loss:np.float64(0.2424181249613563) - actor/pg_clipfrac:np.float64(0.001012976813702456) - actor/ppo_kl:np.float64(0.00010447168263757096) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.3281548023223877) - 
perf/mfu/actor:np.float64(0.2124275386751046) - perf/max_memory_allocated_gb:np.float64(73.96442461013794) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(796.5147857666016) - actor/lr:np.float64(1e-06) - training/global_step:43 - training/epoch:7 - critic/score/mean:-4.676591396331787 - critic/score/max:0.0 - critic/score/min:-6.655036449432373 - critic/rewards/mean:-4.676591396331787 - critic/rewards/max:0.0 - critic/rewards/min:-6.655036449432373 - critic/advantages/mean:0.003880059579387307 - critic/advantages/max:1.154699444770813 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.003880059579387307 - critic/returns/max:1.154699444770813 - critic/returns/min:-1.1546998023986816 - response_length/mean:383.8046875 - response_length/max:597.0 - response_length/min:247.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:383.8046875 - response_length_non_aborted/max:597.0 - response_length_non_aborted/min:247.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00036179088056087494 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(21.797359116375446) - timing_s/agent_loop/generate_sequences/max:np.float64(37.623481933027506) - timing_s/agent_loop/generate_sequences/mean:np.float64(32.887034635725286) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(37.623481933027506) - 
timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:597 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:540.0616846513003 - timing_s/reward:0.00019257515668869019 - timing_s/old_log_prob:53.545377960428596 - timing_s/ref:60.018699530512094 - timing_s/adv:0.10529512539505959 - timing_s/update_actor:283.0155962482095 - timing_s/update_weights:23.376380365341902 - timing_s/step:960.9645128250122 - timing_s/stop_profile:0.00020590797066688538 - timing_per_token_ms/ref:0.04376620633895415 - timing_per_token_ms/update_actor:0.20637766361872367 - timing_per_token_ms/adv:7.67822065551994e-05 - timing_per_token_ms/gen:0.9160978766789822 - perf/total_num_tokens:1371348 - perf/time_per_step:960.9645128250122 - perf/throughput:713.5268689415782 +step:44 - global_seqlen/min:684901 - global_seqlen/max:685621 - global_seqlen/minmax_diff:720 - global_seqlen/balanced_min:685261 - global_seqlen/balanced_max:685261 - global_seqlen/mean:685261.0 - actor/entropy:0.4436521828174591 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.0002091425897864012) - actor/kl_loss:np.float64(0.25151689536869526) - actor/pg_clipfrac:np.float64(0.0010630178658175282) - actor/ppo_kl:np.float64(4.38429009363972e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.31864093244075775) - perf/mfu/actor:np.float64(0.23748352217400284) - perf/max_memory_allocated_gb:np.float64(73.96442461013794) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(796.471622467041) - actor/lr:np.float64(1e-06) - training/global_step:44 - training/epoch:7 - critic/score/mean:-4.7049336433410645 - critic/score/max:-3.7316999435424805 - critic/score/min:-6.6583356857299805 - critic/rewards/mean:-4.7049336433410645 - critic/rewards/max:-3.7316999435424805 - critic/rewards/min:-6.6583356857299805 - 
critic/advantages/mean:0.0029362065251916647 - critic/advantages/max:1.1546999216079712 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0029362065251916647 - critic/returns/max:1.1546999216079712 - critic/returns/min:-1.1546998023986816 - response_length/mean:383.2669372558594 - response_length/max:584.0 - response_length/min:248.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:383.2669372558594 - response_length_non_aborted/max:584.0 - response_length_non_aborted/min:248.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00010156072676181793 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(22.07578019425273) - timing_s/agent_loop/generate_sequences/max:np.float64(38.43459939211607) - timing_s/agent_loop/generate_sequences/mean:np.float64(33.35813142585539) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(38.43459939211607) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:584 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:538.0585689730942 - timing_s/reward:0.00015261396765708923 - timing_s/old_log_prob:53.28497677668929 - timing_s/ref:58.27342115342617 - timing_s/adv:0.0811269786208868 - timing_s/update_actor:253.1274349540472 - timing_s/update_weights:23.57608055882156 - timing_s/step:927.3327898737043 - 
timing_s/stop_profile:0.00020982883870601654 - timing_per_token_ms/ref:0.0425191431829815 - timing_per_token_ms/update_actor:0.18469417853492845 - timing_per_token_ms/adv:5.919421842253302e-05 - timing_per_token_ms/gen:0.9139806300906309 - perf/total_num_tokens:1370522 - perf/time_per_step:927.3327898737043 - perf/throughput:738.9590958962287 +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 45} +validation generation end +step:45 - global_seqlen/min:682861 - global_seqlen/max:686732 - global_seqlen/minmax_diff:3871 - global_seqlen/balanced_min:684796 - global_seqlen/balanced_max:684797 - global_seqlen/mean:684796.5 - actor/entropy:0.44099748134613037 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.00018637157093811564) - actor/kl_loss:np.float64(0.2586772491534551) - actor/pg_clipfrac:np.float64(0.0010361139781404443) - actor/ppo_kl:np.float64(0.00011965021190007974) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.3519202768802643) - perf/mfu/actor:np.float64(0.23779154871645744) - perf/max_memory_allocated_gb:np.float64(73.96442461013794) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(797.1557426452637) - actor/lr:np.float64(1e-06) - val-aux/multiclinsum/reward/mean@1:np.float64(-4.740546636020436) - val-core/multiclinsum/acc/mean@1:np.float64(-4.740546653982495) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) - training/global_step:45 - training/epoch:7 - critic/score/mean:-4.703165531158447 - critic/score/max:-4.166399955749512 - critic/score/min:-6.657471656799316 - critic/rewards/mean:-4.703165531158447 - critic/rewards/max:-4.166399955749512 - critic/rewards/min:-6.657471656799316 - critic/advantages/mean:0.0021458857227116823 - critic/advantages/max:1.1546993255615234 - 
critic/advantages/min:-1.1546999216079712 - critic/returns/mean:0.0021458857227116823 - critic/returns/max:1.1546993255615234 - critic/returns/min:-1.1546999216079712 - response_length/mean:382.662109375 - response_length/max:596.0 - response_length/min:259.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:382.662109375 - response_length_non_aborted/max:596.0 - response_length_non_aborted/min:259.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:6.393715739250183e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(23.336656583473086) - timing_s/agent_loop/generate_sequences/max:np.float64(39.1468316540122) - timing_s/agent_loop/generate_sequences/mean:np.float64(33.833960688850006) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(39.1468316540122) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:560 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:512.3853424545377 - timing_s/reward:0.00020130351185798645 - timing_s/old_log_prob:53.061512406915426 - timing_s/ref:57.513591822236776 - timing_s/adv:0.10628525167703629 - timing_s/update_actor:252.60829523205757 - timing_s/update_weights:19.887087220326066 - timing_s/step:896.3448001146317 - timing_s/testing:118.47730234451592 - timing_s/stop_profile:0.00022782199084758759 - 
timing_per_token_ms/ref:0.04199319930974879 - timing_per_token_ms/update_actor:0.18444041056872923 - timing_per_token_ms/adv:7.760353015606555e-05 - timing_per_token_ms/gen:0.871746115318327 - perf/total_num_tokens:1369593 - perf/time_per_step:896.3448001146317 - perf/throughput:763.9878090578791 +step:46 - global_seqlen/min:686478 - global_seqlen/max:689837 - global_seqlen/minmax_diff:3359 - global_seqlen/balanced_min:688157 - global_seqlen/balanced_max:688158 - global_seqlen/mean:688157.5 - actor/entropy:0.4402969181537628 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(5.774886813014399e-05) - actor/kl_loss:np.float64(0.2643445969248811) - actor/pg_clipfrac:np.float64(0.0013825777229309704) - actor/ppo_kl:np.float64(0.00017153578536029576) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.3932368606328964) - perf/mfu/actor:np.float64(0.23256321783252698) - perf/max_memory_allocated_gb:np.float64(73.96442461013794) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(798.8592338562012) - actor/lr:np.float64(1e-06) - training/global_step:46 - training/epoch:7 - critic/score/mean:-4.70769739151001 - critic/score/max:-3.9235427379608154 - critic/score/min:-6.657140731811523 - critic/rewards/mean:-4.70769739151001 - critic/rewards/max:-3.9235427379608154 - critic/rewards/min:-6.657140731811523 - critic/advantages/mean:0.006182671524584293 - critic/advantages/max:1.1547000408172607 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.006182671524584293 - critic/returns/max:1.1547000408172607 - critic/returns/min:-1.1546998023986816 - response_length/mean:387.0384216308594 - response_length/max:584.0 - response_length/min:262.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:387.0384216308594 - response_length_non_aborted/max:584.0 - response_length_non_aborted/min:262.0 - response_length_non_aborted/clip_ratio:0.0 - 
response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.0001260414719581604 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(23.293215420097113) - timing_s/agent_loop/generate_sequences/max:np.float64(38.09784034267068) - timing_s/agent_loop/generate_sequences/mean:np.float64(33.28214227034065) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(38.09784034267068) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:584 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:963.6050769537687 - timing_s/reward:0.00018410570919513702 - timing_s/old_log_prob:53.312312446534634 - timing_s/ref:58.7852944470942 - timing_s/adv:0.12232540175318718 - timing_s/update_actor:259.56439612433314 - timing_s/update_weights:21.588204795494676 - timing_s/step:1357.818133143708 - timing_s/stop_profile:0.00020798109471797943 - timing_per_token_ms/ref:0.04271209312337234 - timing_per_token_ms/update_actor:0.18859374207527574 - timing_per_token_ms/adv:8.887892797302012e-05 - timing_per_token_ms/gen:1.6208909419213557 - perf/total_num_tokens:1376315 - perf/time_per_step:1357.818133143708 - perf/throughput:506.8112460736795 +step:47 - global_seqlen/min:691557 - global_seqlen/max:694570 - global_seqlen/minmax_diff:3013 - global_seqlen/balanced_min:693063 - global_seqlen/balanced_max:693064 - global_seqlen/mean:693063.5 - 
actor/entropy:0.43533390760421753 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(7.9585908679289e-05) - actor/kl_loss:np.float64(0.27385216175268096) - actor/pg_clipfrac:np.float64(0.0016702957727829926) - actor/ppo_kl:np.float64(0.0004301466785060863) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.4420125037431717) - perf/mfu/actor:np.float64(0.21566376024495382) - perf/max_memory_allocated_gb:np.float64(73.96442461013794) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(795.3225288391113) - actor/lr:np.float64(1e-06) - training/global_step:47 - training/epoch:7 - critic/score/mean:-4.682229518890381 - critic/score/max:0.0 - critic/score/min:-6.656784534454346 - critic/rewards/mean:-4.682229518890381 - critic/rewards/max:0.0 - critic/rewards/min:-6.656784534454346 - critic/advantages/mean:0.0037721386179327965 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:0.0037721386179327965 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546999216079712 - response_length/mean:393.4264221191406 - response_length/max:615.0 - response_length/min:264.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:393.4264221191406 - response_length_non_aborted/max:615.0 - response_length_non_aborted/min:264.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:6.95250928401947e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(24.40025358647108) - 
timing_s/agent_loop/generate_sequences/max:np.float64(40.59783633425832) - timing_s/agent_loop/generate_sequences/mean:np.float64(35.24475956597113) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(40.59783633425832) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:615 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:519.379565987736 - timing_s/reward:0.00024126656353473663 - timing_s/old_log_prob:54.308500573039055 - timing_s/ref:61.48584991320968 - timing_s/adv:0.11136819049715996 - timing_s/update_actor:281.891505099833 - timing_s/update_weights:26.64723516255617 - timing_s/step:944.6533793788403 - timing_s/stop_profile:0.00022842735052108765 - timing_per_token_ms/ref:0.04435802052280179 - timing_per_token_ms/update_actor:0.20336628974100715 - timing_per_token_ms/adv:8.034486774816446e-05 - timing_per_token_ms/gen:0.859468786333571 - perf/total_num_tokens:1386127 - perf/time_per_step:944.6533793788403 - perf/throughput:733.6696349466573 +step:48 - global_seqlen/min:692475 - global_seqlen/max:695549 - global_seqlen/minmax_diff:3074 - global_seqlen/balanced_min:694012 - global_seqlen/balanced_max:694012 - global_seqlen/mean:694012.0 - actor/entropy:0.4345180094242096 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.00016317271850615047) - actor/kl_loss:np.float64(0.2740267654880881) - actor/pg_clipfrac:np.float64(0.0009678966356053328) - actor/ppo_kl:np.float64(9.294387973568519e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.3233174830675125) - perf/mfu/actor:np.float64(0.21382624693127278) - perf/max_memory_allocated_gb:np.float64(73.96442461013794) - perf/max_memory_reserved_gb:np.float64(77.048828125) 
- perf/cpu_memory_used_gb:np.float64(795.3202896118164) - actor/lr:np.float64(1e-06) - training/global_step:48 - training/epoch:7 - critic/score/mean:-4.706887245178223 - critic/score/max:-3.794605016708374 - critic/score/min:-6.657225608825684 - critic/rewards/mean:-4.706887245178223 - critic/rewards/max:-3.794605016708374 - critic/rewards/min:-6.657225608825684 - critic/advantages/mean:0.00515801040455699 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.00515801040455699 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546998023986816 - response_length/mean:394.6614685058594 - response_length/max:609.0 - response_length/min:262.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:394.6614685058594 - response_length_non_aborted/max:609.0 - response_length_non_aborted/min:262.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00010688789188861847 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(26.071544943377376) - timing_s/agent_loop/generate_sequences/max:np.float64(42.66504089534283) - timing_s/agent_loop/generate_sequences/mean:np.float64(37.51373488619114) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(42.66504089534283) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:582 - 
timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:545.7739055678248 - timing_s/reward:0.00017266720533370972 - timing_s/old_log_prob:54.30322212725878 - timing_s/ref:61.29109136760235 - timing_s/adv:0.11960364133119583 - timing_s/update_actor:284.67682934366167 - timing_s/update_weights:25.087926844134927 - timing_s/step:972.085301047191 - timing_s/stop_profile:0.00020149536430835724 - timing_per_token_ms/ref:0.04415708328357604 - timing_per_token_ms/update_actor:0.20509503390695094 - timing_per_token_ms/adv:8.616828047007533e-05 - timing_per_token_ms/gen:0.9003198706166692 - perf/total_num_tokens:1388024 - perf/time_per_step:972.085301047191 - perf/throughput:713.9414609524152 +step:49 - global_seqlen/min:700765 - global_seqlen/max:702639 - global_seqlen/minmax_diff:1874 - global_seqlen/balanced_min:701702 - global_seqlen/balanced_max:701702 - global_seqlen/mean:701702.0 - actor/entropy:0.4305967390537262 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.00028688112312617517) - actor/kl_loss:np.float64(0.2799825742840767) - actor/pg_clipfrac:np.float64(0.0014262865831066545) - actor/ppo_kl:np.float64(0.00017909571128408666) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.4899856746196747) - perf/mfu/actor:np.float64(0.18371401600013793) - perf/max_memory_allocated_gb:np.float64(73.97368860244751) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(794.8248329162598) - actor/lr:np.float64(1e-06) - training/global_step:49 - training/epoch:8 - critic/score/mean:-4.706087589263916 - critic/score/max:-3.9997334480285645 - critic/score/min:-6.656299114227295 - critic/rewards/mean:-4.706087589263916 - critic/rewards/max:-3.9997334480285645 - critic/rewards/min:-6.656299114227295 - critic/advantages/mean:0.004243197850883007 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546998023986816 - 
critic/returns/mean:0.004243197850883007 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546998023986816 - response_length/mean:404.6744689941406 - response_length/max:586.0 - response_length/min:258.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:404.6744689941406 - response_length_non_aborted/max:586.0 - response_length_non_aborted/min:258.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00038223154842853546 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(24.225112522020936) - timing_s/agent_loop/generate_sequences/max:np.float64(42.66993950866163) - timing_s/agent_loop/generate_sequences/mean:np.float64(37.45695454301798) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(42.66993950866163) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:578 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:675.7836391851306 - timing_s/reward:0.0002264808863401413 - timing_s/old_log_prob:55.09367262199521 - timing_s/ref:62.05543091520667 - timing_s/adv:0.12431935593485832 - timing_s/update_actor:335.07023707591 - timing_s/update_weights:27.44576799683273 - timing_s/step:1156.3825458418578 - timing_s/stop_profile:0.0002916380763053894 - timing_per_token_ms/ref:0.04421779538551028 - timing_per_token_ms/update_actor:0.23875536700473277 - 
timing_per_token_ms/adv:8.858415391067598e-05 - timing_per_token_ms/gen:1.087202997498521 - perf/total_num_tokens:1403404 - perf/time_per_step:1156.3825458418578 - perf/throughput:606.807844448356 +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 50} +validation generation end +step:50 - global_seqlen/min:706378 - global_seqlen/max:706455 - global_seqlen/minmax_diff:77 - global_seqlen/balanced_min:706416 - global_seqlen/balanced_max:706417 - global_seqlen/mean:706416.5 - actor/entropy:0.43580639362335205 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.00024110255859947569) - actor/kl_loss:np.float64(0.2841534996405244) - actor/pg_clipfrac:np.float64(0.0011718522534162428) - actor/ppo_kl:np.float64(0.000218989194953186) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.36724187433719635) - perf/mfu/actor:np.float64(0.1804877511517157) - perf/max_memory_allocated_gb:np.float64(74.2978835105896) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(789.6419162750244) - actor/lr:np.float64(1e-06) - val-aux/multiclinsum/reward/mean@1:np.float64(-4.727995289073271) - val-core/multiclinsum/acc/mean@1:np.float64(-4.727995311069559) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) - training/global_step:50 - training/epoch:8 - critic/score/mean:-4.694897651672363 - critic/score/max:0.0 - critic/score/min:-6.659550666809082 - critic/rewards/mean:-4.694897651672363 - critic/rewards/max:0.0 - critic/rewards/min:-6.659550666809082 - critic/advantages/mean:0.0029126941226422787 - critic/advantages/max:1.1546999216079712 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:0.0029126941226422787 - critic/returns/max:1.1546999216079712 - critic/returns/min:-1.1546999216079712 - 
response_length/mean:410.8131408691406 - response_length/max:612.0 - response_length/min:264.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:410.8131408691406 - response_length_non_aborted/max:612.0 - response_length_non_aborted/min:264.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:9.388662874698639e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(24.955151222646236) - timing_s/agent_loop/generate_sequences/max:np.float64(43.75764816813171) - timing_s/agent_loop/generate_sequences/mean:np.float64(37.54037082499902) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(43.75764816813171) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:601 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:561.7204908505082 - timing_s/reward:0.0004516970366239548 - timing_s/old_log_prob:55.61285909824073 - timing_s/ref:63.53428631089628 - timing_s/adv:0.12996750324964523 - timing_s/update_actor:343.360773736611 - timing_s/update_weights:22.84923947416246 - timing_s/step:1048.1745034605265 - timing_s/testing:122.73416507989168 - timing_s/stop_profile:0.0002567749470472336 - timing_per_token_ms/ref:0.044969424065615875 - timing_per_token_ms/update_actor:0.24302997858671974 - timing_per_token_ms/adv:9.199070466901978e-05 - 
timing_per_token_ms/gen:0.890194103175245 - perf/total_num_tokens:1412833 - perf/time_per_step:1048.1745034605265 - perf/throughput:673.949325868718 +step:51 - global_seqlen/min:699657 - global_seqlen/max:700835 - global_seqlen/minmax_diff:1178 - global_seqlen/balanced_min:700246 - global_seqlen/balanced_max:700246 - global_seqlen/mean:700246.0 - actor/entropy:0.43067172169685364 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.0010369775457850956) - actor/kl_loss:np.float64(0.2928173585484426) - actor/pg_clipfrac:np.float64(0.0011770332979115967) - actor/ppo_kl:np.float64(0.0002102083916497577) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.34390898048877716) - perf/mfu/actor:np.float64(0.19424467793648817) - perf/max_memory_allocated_gb:np.float64(74.2978835105896) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(788.5504722595215) - actor/lr:np.float64(1e-06) - training/global_step:51 - training/epoch:8 - critic/score/mean:-4.712602615356445 - critic/score/max:-3.6137683391571045 - critic/score/min:-6.658881187438965 - critic/rewards/mean:-4.712602615356445 - critic/rewards/max:-3.6137683391571045 - critic/rewards/min:-6.658881187438965 - critic/advantages/mean:0.0027785601560026407 - critic/advantages/max:1.1546999216079712 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0027785601560026407 - critic/returns/max:1.1546999216079712 - critic/returns/min:-1.1546998023986816 - response_length/mean:402.7786560058594 - response_length/max:625.0 - response_length/min:253.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:402.7786560058594 - response_length_non_aborted/max:625.0 - response_length_non_aborted/min:253.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - 
num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:5.999021232128143e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(22.150692572817206) - timing_s/agent_loop/generate_sequences/max:np.float64(39.220184832811356) - timing_s/agent_loop/generate_sequences/mean:np.float64(33.982375266571275) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(39.220184832811356) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:625 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:515.739121614024 - timing_s/reward:0.00020303577184677124 - timing_s/old_log_prob:54.480300610885024 - timing_s/ref:61.62232657894492 - timing_s/adv:0.12080245837569237 - timing_s/update_actor:316.1892445553094 - timing_s/update_weights:22.553211841732264 - timing_s/step:971.5128652490675 - timing_s/stop_profile:0.00021525472402572632 - timing_per_token_ms/ref:0.04400048452896905 - timing_per_token_ms/update_actor:0.22577011832649485 - timing_per_token_ms/adv:8.625715703887803e-05 - timing_per_token_ms/gen:0.8336282490997176 - perf/total_num_tokens:1400492 - perf/time_per_step:971.5128652490675 - perf/throughput:720.7789264020475 +step:52 - global_seqlen/min:698518 - global_seqlen/max:701279 - global_seqlen/minmax_diff:2761 - global_seqlen/balanced_min:699898 - global_seqlen/balanced_max:699899 - global_seqlen/mean:699898.5 - actor/entropy:0.4240064024925232 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.00027717393156005037) - actor/kl_loss:np.float64(0.2940859394147992) - 
actor/pg_clipfrac:np.float64(0.0011057641786464956) - actor/ppo_kl:np.float64(2.7802586449373243e-06) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.3187822848558426) - perf/mfu/actor:np.float64(0.19553939527044767) - perf/max_memory_allocated_gb:np.float64(74.2978835105896) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(792.7732048034668) - actor/lr:np.float64(1e-06) - training/global_step:52 - training/epoch:8 - critic/score/mean:-4.715025424957275 - critic/score/max:-4.110844612121582 - critic/score/min:-6.654635429382324 - critic/rewards/mean:-4.715025424957275 - critic/rewards/max:-4.110844612121582 - critic/rewards/min:-6.654635429382324 - critic/advantages/mean:0.006510026752948761 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:0.006510026752948761 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546999216079712 - response_length/mean:402.326171875 - response_length/max:640.0 - response_length/min:251.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:402.326171875 - response_length_non_aborted/max:640.0 - response_length_non_aborted/min:251.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:8.032098412513733e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(22.82512247376144) - timing_s/agent_loop/generate_sequences/max:np.float64(40.73749357834458) - timing_s/agent_loop/generate_sequences/mean:np.float64(35.09451940798075) - 
timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(40.73749357834458) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:638 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:546.9057796094567 - timing_s/reward:0.0001736115664243698 - timing_s/old_log_prob:54.14199877157807 - timing_s/ref:60.470080299302936 - timing_s/adv:0.1125433649867773 - timing_s/update_actor:313.94184329360723 - timing_s/update_weights:21.55270695872605 - timing_s/step:997.9207843523473 - timing_s/stop_profile:0.0001853015273809433 - timing_per_token_ms/ref:0.04319917838036725 - timing_per_token_ms/update_actor:0.22427669390176377 - timing_per_token_ms/adv:8.039977581519128e-05 - timing_per_token_ms/gen:0.8849994734550808 - perf/total_num_tokens:1399797 - perf/time_per_step:997.9207843523473 - perf/throughput:701.3567719748773 +step:53 - global_seqlen/min:698290 - global_seqlen/max:698909 - global_seqlen/minmax_diff:619 - global_seqlen/balanced_min:698599 - global_seqlen/balanced_max:698600 - global_seqlen/mean:698599.5 - actor/entropy:0.42512115836143494 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.0007808263568828445) - actor/kl_loss:np.float64(0.3062715275834005) - actor/pg_clipfrac:np.float64(0.0009450297050837738) - actor/ppo_kl:np.float64(0.00015670022268921002) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.2952447980642319) - perf/mfu/actor:np.float64(0.2126779692338128) - perf/max_memory_allocated_gb:np.float64(74.2978835105896) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(812.7021160125732) - actor/lr:np.float64(1e-06) - training/global_step:53 - training/epoch:8 - 
critic/score/mean:-4.712332725524902 - critic/score/max:-4.19020938873291 - critic/score/min:-6.657628059387207 - critic/rewards/mean:-4.712332725524902 - critic/rewards/max:-4.19020938873291 - critic/rewards/min:-6.657628059387207 - critic/advantages/mean:0.0084008714184165 - critic/advantages/max:1.154699444770813 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:0.0084008714184165 - critic/returns/max:1.154699444770813 - critic/returns/min:-1.1546999216079712 - response_length/mean:400.634765625 - response_length/max:657.0 - response_length/min:253.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:400.634765625 - response_length_non_aborted/max:657.0 - response_length_non_aborted/min:253.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:6.330572068691254e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(21.95623474381864) - timing_s/agent_loop/generate_sequences/max:np.float64(40.06147608347237) - timing_s/agent_loop/generate_sequences/mean:np.float64(34.538855444272485) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(40.06147608347237) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:657 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:532.4186565820128 - timing_s/reward:0.00020377524197101593 - 
timing_s/old_log_prob:53.558532582595944 - timing_s/ref:60.32291517406702 - timing_s/adv:0.11865415424108505 - timing_s/update_actor:288.23632340319455 - timing_s/update_weights:23.56522866152227 - timing_s/step:959.1059520319104 - timing_s/stop_profile:0.00021587684750556946 - timing_per_token_ms/ref:0.04317417574308815 - timing_per_token_ms/update_actor:0.20629582715360842 - timing_per_token_ms/adv:8.492287372169967e-05 - timing_per_token_ms/gen:0.865193835599452 - perf/total_num_tokens:1397199 - perf/time_per_step:959.1059520319104 - perf/throughput:728.3861585052043 +step:54 - global_seqlen/min:699330 - global_seqlen/max:701696 - global_seqlen/minmax_diff:2366 - global_seqlen/balanced_min:700513 - global_seqlen/balanced_max:700513 - global_seqlen/mean:700513.0 - actor/entropy:0.420221745967865 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.00013568584108725017) - actor/kl_loss:np.float64(0.3110739445934693) - actor/pg_clipfrac:np.float64(0.0008407529021496885) - actor/ppo_kl:np.float64(8.699073100615351e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.322283998131752) - perf/mfu/actor:np.float64(0.20631704939361004) - perf/max_memory_allocated_gb:np.float64(74.2978835105896) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(805.002384185791) - actor/lr:np.float64(1e-06) - training/global_step:54 - training/epoch:8 - critic/score/mean:-4.700043201446533 - critic/score/max:-4.186136722564697 - critic/score/min:-6.651907444000244 - critic/rewards/mean:-4.700043201446533 - critic/rewards/max:-4.186136722564697 - critic/rewards/min:-6.651907444000244 - critic/advantages/mean:0.00455153314396739 - critic/advantages/max:1.1546995639801025 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:0.00455153314396739 - critic/returns/max:1.1546995639801025 - critic/returns/min:-1.1546999216079712 - response_length/mean:403.1263122558594 - 
response_length/max:595.0 - response_length/min:265.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:403.1263122558594 - response_length_non_aborted/max:595.0 - response_length_non_aborted/min:265.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:6.502307951450348e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(23.002650141716003) - timing_s/agent_loop/generate_sequences/max:np.float64(39.91969962604344) - timing_s/agent_loop/generate_sequences/mean:np.float64(34.72484899800596) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(39.91969962604344) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:595 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:812.3667445611209 - timing_s/reward:0.00018676556646823883 - timing_s/old_log_prob:54.46342660859227 - timing_s/ref:60.221918074414134 - timing_s/adv:0.12287429347634315 - timing_s/update_actor:297.8658072333783 - timing_s/update_weights:19.206705685704947 - timing_s/step:1245.0418245494366 - timing_s/stop_profile:0.00021226704120635986 - timing_per_token_ms/ref:0.04298415452276698 - timing_per_token_ms/update_actor:0.21260548143530406 - timing_per_token_ms/adv:8.770307865545904e-05 - timing_per_token_ms/gen:1.3119575591828205 - perf/total_num_tokens:1401026 - 
perf/time_per_step:1245.0418245494366 - perf/throughput:562.6421427677789 +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 55} +validation generation end +step:55 - global_seqlen/min:709005 - global_seqlen/max:709276 - global_seqlen/minmax_diff:271 - global_seqlen/balanced_min:709140 - global_seqlen/balanced_max:709141 - global_seqlen/mean:709140.5 - actor/entropy:0.4182867109775543 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.00100061814494741) - actor/kl_loss:np.float64(0.3188770258178314) - actor/pg_clipfrac:np.float64(0.0008918741538460987) - actor/ppo_kl:np.float64(0.00022644570621347762) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.333443745970726) - perf/mfu/actor:np.float64(0.20094538109757037) - perf/max_memory_allocated_gb:np.float64(74.45402526855469) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(794.8800506591797) - actor/lr:np.float64(1e-06) - val-aux/multiclinsum/reward/mean@1:np.float64(-4.694728998576894) - val-core/multiclinsum/acc/mean@1:np.float64(-4.694729024530053) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) - training/global_step:55 - training/epoch:9 - critic/score/mean:-4.68239688873291 - critic/score/max:-3.9163999557495117 - critic/score/min:-6.65609073638916 - critic/rewards/mean:-4.68239688873291 - critic/rewards/max:-3.9163999557495117 - critic/rewards/min:-6.65609073638916 - critic/advantages/mean:0.0026545177679508924 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:0.0026545177679508924 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546999216079712 - response_length/mean:414.3600158691406 - response_length/max:641.0 - response_length/min:248.0 - 
response_length/clip_ratio:0.0 - response_length_non_aborted/mean:414.3600158691406 - response_length_non_aborted/max:641.0 - response_length_non_aborted/min:248.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.0004948172718286514 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(23.158608242869377) - timing_s/agent_loop/generate_sequences/max:np.float64(45.80425193719566) - timing_s/agent_loop/generate_sequences/mean:np.float64(39.33163471563239) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(45.80425193719566) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:641 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:797.406694997102 - timing_s/reward:0.00019285082817077637 - timing_s/old_log_prob:54.696571780368686 - timing_s/ref:61.20078814215958 - timing_s/adv:0.11408025771379471 - timing_s/update_actor:309.7049307245761 - timing_s/update_weights:18.703609511256218 - timing_s/step:1242.6241091173142 - timing_s/testing:120.91760691255331 - timing_s/stop_profile:0.0001603737473487854 - timing_per_token_ms/ref:0.04315138406434239 - timing_per_token_ms/update_actor:0.21836641027030335 - timing_per_token_ms/adv:8.043558202767626e-05 - timing_per_token_ms/gen:1.25288384760809 - perf/total_num_tokens:1418281 - perf/time_per_step:1242.6241091173142 - 
perf/throughput:570.6798176511568 +step:56 - global_seqlen/min:713420 - global_seqlen/max:713478 - global_seqlen/minmax_diff:58 - global_seqlen/balanced_min:713449 - global_seqlen/balanced_max:713449 - global_seqlen/mean:713449.0 - actor/entropy:0.41691213846206665 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.0006702266534072514) - actor/kl_loss:np.float64(0.31936811159054435) - actor/pg_clipfrac:np.float64(0.0011391128379424724) - actor/ppo_kl:np.float64(1.0882017780507644e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.3867972642183304) - perf/mfu/actor:np.float64(0.1686939655750308) - perf/max_memory_allocated_gb:np.float64(74.45402526855469) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(793.6258811950684) - actor/lr:np.float64(1e-06) - training/global_step:56 - training/epoch:9 - critic/score/mean:-4.707076072692871 - critic/score/max:-3.0 - critic/score/min:-6.65576171875 - critic/rewards/mean:-4.707076072692871 - critic/rewards/max:-3.0 - critic/rewards/min:-6.65576171875 - critic/advantages/mean:0.002697875490412116 - critic/advantages/max:1.154699444770813 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.002697875490412116 - critic/returns/max:1.154699444770813 - critic/returns/min:-1.1546998023986816 - response_length/mean:419.9700622558594 - response_length/max:651.0 - response_length/min:258.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:419.9700622558594 - response_length_non_aborted/max:651.0 - response_length_non_aborted/min:258.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:6.182678043842316e-05 - 
timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(21.96167579293251) - timing_s/agent_loop/generate_sequences/max:np.float64(43.43342655710876) - timing_s/agent_loop/generate_sequences/mean:np.float64(38.12904530791275) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(43.43342655710876) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:568 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:528.9945937674493 - timing_s/reward:0.0002488084137439728 - timing_s/old_log_prob:55.253244053572416 - timing_s/ref:61.49766239710152 - timing_s/adv:0.11963904462754726 - timing_s/update_actor:370.9537728205323 - timing_s/update_weights:18.805028941482306 - timing_s/step:1036.654519194737 - timing_s/stop_profile:0.0002435985952615738 - timing_per_token_ms/ref:0.04309884967047506 - timing_per_token_ms/update_actor:0.25997217237709513 - timing_per_token_ms/adv:8.384554791410967e-05 - timing_per_token_ms/gen:0.8200525734527345 - perf/total_num_tokens:1426898 - perf/time_per_step:1036.654519194737 - perf/throughput:688.2225339201724 +step:57 - global_seqlen/min:722261 - global_seqlen/max:723013 - global_seqlen/minmax_diff:752 - global_seqlen/balanced_min:722611 - global_seqlen/balanced_max:722663 - global_seqlen/mean:722637.0 - actor/entropy:0.411339670419693 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(8.483618148602892e-05) - actor/kl_loss:np.float64(0.32090058301885926) - actor/pg_clipfrac:np.float64(0.0012023215288839613) - actor/ppo_kl:np.float64(0.00028077602814846614) - 
actor/pg_clipfrac_lower:np.float64(1.495465767220594e-06) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.36357875168323517) - perf/mfu/actor:np.float64(0.15234600921717034) - perf/max_memory_allocated_gb:np.float64(74.83380031585693) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(790.6411418914795) - actor/lr:np.float64(1e-06) - training/global_step:57 - training/epoch:9 - critic/score/mean:-4.705036640167236 - critic/score/max:-3.6400842666625977 - critic/score/min:-6.6583356857299805 - critic/rewards/mean:-4.705036640167236 - critic/rewards/max:-3.6400842666625977 - critic/rewards/min:-6.6583356857299805 - critic/advantages/mean:0.0013303790474310517 - critic/advantages/max:1.1546999216079712 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0013303790474310517 - critic/returns/max:1.1546999216079712 - critic/returns/min:-1.1546998023986816 - response_length/mean:431.93359375 - response_length/max:806.0 - response_length/min:267.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:431.93359375 - response_length_non_aborted/max:806.0 - response_length_non_aborted/min:267.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00010305270552635193 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(25.108676893636584) - timing_s/agent_loop/generate_sequences/max:np.float64(48.84695467725396) - timing_s/agent_loop/generate_sequences/mean:np.float64(42.17134715287951) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - 
timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(48.84695467725396) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:806 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:519.179569805041 - timing_s/reward:0.00030428171157836914 - timing_s/old_log_prob:55.21774177066982 - timing_s/ref:61.93451111763716 - timing_s/adv:0.1834796406328678 - timing_s/update_actor:416.1396218277514 - timing_s/update_weights:22.034963253885508 - timing_s/step:1075.5846728291363 - timing_s/stop_profile:0.00023746676743030548 - timing_per_token_ms/ref:0.04285312758524484 - timing_per_token_ms/update_actor:0.28793130010485996 - timing_per_token_ms/adv:0.00012695145739345467 - timing_per_token_ms/gen:0.7825451349838586 - perf/total_num_tokens:1445274 - perf/time_per_step:1075.5846728291363 - perf/throughput:671.8550554455471 +step:58 - global_seqlen/min:724873 - global_seqlen/max:726752 - global_seqlen/minmax_diff:1879 - global_seqlen/balanced_min:725812 - global_seqlen/balanced_max:725813 - global_seqlen/mean:725812.5 - actor/entropy:0.4053240120410919 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-4.3136421785073056e-05) - actor/kl_loss:np.float64(0.3228552627066772) - actor/pg_clipfrac:np.float64(0.0008525027697032783) - actor/ppo_kl:np.float64(7.736939369351603e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.28031718730926514) - perf/mfu/actor:np.float64(0.15127253623296982) - perf/max_memory_allocated_gb:np.float64(75.16064167022705) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(789.877197265625) - actor/lr:np.float64(1e-06) - training/global_step:58 - training/epoch:9 - critic/score/mean:-4.711802959442139 - 
critic/score/max:-3.6663999557495117 - critic/score/min:-6.657309055328369 - critic/rewards/mean:-4.711802959442139 - critic/rewards/max:-3.6663999557495117 - critic/rewards/min:-6.657309055328369 - critic/advantages/mean:0.0015606521628797054 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1547000408172607 - critic/returns/mean:0.0015606521628797054 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1547000408172607 - response_length/mean:436.068359375 - response_length/max:613.0 - response_length/min:265.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:436.068359375 - response_length_non_aborted/max:613.0 - response_length_non_aborted/min:265.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:7.087923586368561e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(23.768994065001607) - timing_s/agent_loop/generate_sequences/max:np.float64(47.166289588436484) - timing_s/agent_loop/generate_sequences/mean:np.float64(41.43645663351466) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(47.166289588436484) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:607 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:770.8718810901046 - timing_s/reward:0.00026206672191619873 - timing_s/old_log_prob:55.34656658023596 - 
timing_s/ref:62.017607390880585 - timing_s/adv:0.13097251392900944 - timing_s/update_actor:420.8795684091747 - timing_s/update_weights:24.814998192712665 - timing_s/step:1334.8783499747515 - timing_s/stop_profile:0.00023277290165424347 - timing_per_token_ms/ref:0.04272288462301255 - timing_per_token_ms/update_actor:0.28993684209708065 - timing_per_token_ms/adv:9.022475772255882e-05 - timing_per_token_ms/gen:1.1508968799540529 - perf/total_num_tokens:1451625 - perf/time_per_step:1334.8783499747515 - perf/throughput:543.7293218619722 +step:59 - global_seqlen/min:719291 - global_seqlen/max:725626 - global_seqlen/minmax_diff:6335 - global_seqlen/balanced_min:722458 - global_seqlen/balanced_max:722459 - global_seqlen/mean:722458.5 - actor/entropy:0.3993646204471588 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.0003620398977849) - actor/kl_loss:np.float64(0.3210269628713528) - actor/pg_clipfrac:np.float64(0.000989803976456945) - actor/ppo_kl:np.float64(3.960580672905204e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.2998743802309036) - perf/mfu/actor:np.float64(0.14549138188002916) - perf/max_memory_allocated_gb:np.float64(75.16064167022705) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(785.8796882629395) - actor/lr:np.float64(1e-06) - training/global_step:59 - training/epoch:9 - critic/score/mean:-4.667497158050537 - critic/score/max:0.0 - critic/score/min:-6.656299114227295 - critic/rewards/mean:-4.667497158050537 - critic/rewards/max:0.0 - critic/rewards/min:-6.656299114227295 - critic/advantages/mean:0.0036389983724802732 - critic/advantages/max:1.154699683189392 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:0.0036389983724802732 - critic/returns/max:1.154699683189392 - critic/returns/min:-1.1546999216079712 - response_length/mean:431.701171875 - response_length/max:637.0 - response_length/min:240.0 - 
response_length/clip_ratio:0.0 - response_length_non_aborted/mean:431.701171875 - response_length_non_aborted/max:637.0 - response_length_non_aborted/min:240.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:8.925609290599823e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(22.40810678899288) - timing_s/agent_loop/generate_sequences/max:np.float64(48.67683762870729) - timing_s/agent_loop/generate_sequences/mean:np.float64(42.53744651919745) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(48.67683762870729) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:595 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:762.2154413275421 - timing_s/reward:0.00740097276866436 - timing_s/old_log_prob:55.739321146160364 - timing_s/ref:63.56138164177537 - timing_s/adv:0.110495550557971 - timing_s/update_actor:435.5310457777232 - timing_s/update_weights:23.81368323788047 - timing_s/step:1341.8691854253411 - timing_s/stop_profile:0.0002996176481246948 - timing_per_token_ms/ref:0.04398964206371395 - timing_per_token_ms/update_actor:0.3014228815757052 - timing_per_token_ms/adv:7.647190154034522e-05 - timing_per_token_ms/gen:1.1494849762062669 - perf/total_num_tokens:1444917 - perf/time_per_step:1341.8691854253411 - perf/throughput:538.3971163858254 +test_gen_batch meta info: 
{'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 60} +validation generation end +local_global_step_folder: /home/mshahidul/readctrl/code/RL_model/RL_model/global_step_60 +Warning: remove_previous_ckpt_in_save is deprecated, set max_actor_ckpt_to_keep=1 and max_critic_ckpt_to_keep=1 instead +step:60 - global_seqlen/min:722476 - global_seqlen/max:723111 - global_seqlen/minmax_diff:635 - global_seqlen/balanced_min:722793 - global_seqlen/balanced_max:722794 - global_seqlen/mean:722793.5 - actor/entropy:0.39816075563430786 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0004114592641902431) - actor/kl_loss:np.float64(0.32904384223123395) - actor/pg_clipfrac:np.float64(0.001024293334921822) - actor/ppo_kl:np.float64(0.00012798451435950634) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.29278530180454254) - perf/mfu/actor:np.float64(0.15130566141937374) - perf/max_memory_allocated_gb:np.float64(75.16064167022705) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(785.277587890625) - actor/lr:np.float64(1e-06) - val-aux/multiclinsum/reward/mean@1:np.float64(-4.680191564559936) - val-core/multiclinsum/acc/mean@1:np.float64(-4.680191588661088) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) - training/global_step:60 - training/epoch:9 - critic/score/mean:-4.69797945022583 - critic/score/max:-3.7316999435424805 - critic/score/min:-6.654054164886475 - critic/rewards/mean:-4.69797945022583 - critic/rewards/max:-3.7316999435424805 - critic/rewards/min:-6.654054164886475 - critic/advantages/mean:0.004703603219240904 - critic/advantages/max:1.154699683189392 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.004703603219240904 - critic/returns/max:1.154699683189392 - critic/returns/min:-1.1546998023986816 - 
response_length/mean:432.1373596191406 - response_length/max:616.0 - response_length/min:248.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:432.1373596191406 - response_length_non_aborted/max:616.0 - response_length_non_aborted/min:248.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:8.780881762504578e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(23.25878264196217) - timing_s/agent_loop/generate_sequences/max:np.float64(47.81701294705272) - timing_s/agent_loop/generate_sequences/mean:np.float64(42.10669604732296) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(47.81701294705272) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:601 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:533.4523629900068 - timing_s/reward:0.00020674802362918854 - timing_s/old_log_prob:56.20452526025474 - timing_s/ref:64.15551071986556 - timing_s/adv:0.10772703774273396 - timing_s/update_actor:419.09799506515265 - timing_s/update_weights:25.278261076658964 - timing_s/step:1099.1117200255394 - timing_s/testing:124.78820899873972 - timing_s/save_checkpoint:137.9368465486914 - timing_s/stop_profile:0.0002896096557378769 - timing_per_token_ms/ref:0.044380248798491936 - timing_per_token_ms/update_actor:0.2899154426991614 - 
timing_per_token_ms/adv:7.452131054217695e-05 - timing_per_token_ms/gen:0.8036789682311409 - perf/total_num_tokens:1445587 - perf/time_per_step:1099.1117200255394 - perf/throughput:657.6160428743357 +step:61 - global_seqlen/min:729078 - global_seqlen/max:733124 - global_seqlen/minmax_diff:4046 - global_seqlen/balanced_min:731101 - global_seqlen/balanced_max:731101 - global_seqlen/mean:731101.0 - actor/entropy:0.39293429255485535 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(6.531880838641724e-05) - actor/kl_loss:np.float64(0.32687242018679774) - actor/pg_clipfrac:np.float64(0.00092739361692414) - actor/ppo_kl:np.float64(4.256101731433167e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.2759390026330948) - perf/mfu/actor:np.float64(0.25692270063869277) - perf/max_memory_allocated_gb:np.float64(75.42926216125488) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(779.9527339935303) - actor/lr:np.float64(1e-06) - training/global_step:61 - training/epoch:10 - critic/score/mean:-4.691617965698242 - critic/score/max:-3.6137683391571045 - critic/score/min:-6.657309055328369 - critic/rewards/mean:-4.691617965698242 - critic/rewards/max:-3.6137683391571045 - critic/rewards/min:-6.657309055328369 - critic/advantages/mean:0.0019376679556444287 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0019376679556444287 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546998023986816 - response_length/mean:442.9544372558594 - response_length/max:644.0 - response_length/min:247.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:442.9544372558594 - response_length_non_aborted/max:644.0 - response_length_non_aborted/min:247.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - 
prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.0005176924169063568 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(23.691338935866952) - timing_s/agent_loop/generate_sequences/max:np.float64(49.69136776775122) - timing_s/agent_loop/generate_sequences/mean:np.float64(43.83408540365417) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(49.69136776775122) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:595 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:832.2776970025152 - timing_s/reward:0.0003482699394226074 - timing_s/old_log_prob:57.204485695809126 - timing_s/ref:70.25385226681828 - timing_s/adv:0.12505370378494263 - timing_s/update_actor:250.4351920634508 - timing_s/update_weights:24.534404516220093 - timing_s/step:1235.6625038720667 - timing_s/stop_profile:0.00015634112060070038 - timing_per_token_ms/ref:0.04804661207331017 - timing_per_token_ms/update_actor:0.17127263679262567 - timing_per_token_ms/adv:8.552423248288719e-05 - timing_per_token_ms/gen:1.2232578022841938 - perf/total_num_tokens:1462202 - perf/time_per_step:1235.6625038720667 - perf/throughput:591.6672211943188 +step:62 - global_seqlen/min:728996 - global_seqlen/max:729435 - global_seqlen/minmax_diff:439 - global_seqlen/balanced_min:729215 - global_seqlen/balanced_max:729216 - global_seqlen/mean:729215.5 - actor/entropy:0.3877805769443512 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-1.5832601396441673e-05) - 
actor/kl_loss:np.float64(0.3334187790751457) - actor/pg_clipfrac:np.float64(0.0009343603038966345) - actor/ppo_kl:np.float64(8.663996004543151e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.30690234899520874) - perf/mfu/actor:np.float64(0.1330513388790034) - perf/max_memory_allocated_gb:np.float64(75.42926216125488) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(781.0827026367188) - actor/lr:np.float64(1e-06) - training/global_step:62 - training/epoch:10 - critic/score/mean:-4.694757461547852 - critic/score/max:-3.9235427379608154 - critic/score/min:-6.652511119842529 - critic/rewards/mean:-4.694757461547852 - critic/rewards/max:-3.9235427379608154 - critic/rewards/min:-6.652511119842529 - critic/advantages/mean:0.004479893948882818 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.004479893948882818 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546998023986816 - response_length/mean:440.4993591308594 - response_length/max:620.0 - response_length/min:280.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:440.4993591308594 - response_length_non_aborted/max:620.0 - response_length_non_aborted/min:280.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:9.190291166305542e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(25.81319958344102) - timing_s/agent_loop/generate_sequences/max:np.float64(47.791228922083974) - 
timing_s/agent_loop/generate_sequences/mean:np.float64(42.02339144296881) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(47.791228922083974) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:620 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:560.7903906777501 - timing_s/reward:0.0001968052238225937 - timing_s/old_log_prob:56.56328650563955 - timing_s/ref:66.63313028775156 - timing_s/adv:0.114428685978055 - timing_s/update_actor:480.86126119829714 - timing_s/update_weights:25.63907800987363 - timing_s/step:1191.447736410424 - timing_s/stop_profile:0.0001700781285762787 - timing_per_token_ms/ref:0.04568822953417169 - timing_per_token_ms/update_actor:0.32971135500979964 - timing_per_token_ms/adv:7.846013008366868e-05 - timing_per_token_ms/gen:0.8288273557290275 - perf/total_num_tokens:1458431 - perf/time_per_step:1191.447736410424 - perf/throughput:612.0415337704778 +step:63 - global_seqlen/min:732337 - global_seqlen/max:733308 - global_seqlen/minmax_diff:971 - global_seqlen/balanced_min:732822 - global_seqlen/balanced_max:732823 - global_seqlen/mean:732822.5 - actor/entropy:0.3853428065776825 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0006700030304879513) - actor/kl_loss:np.float64(0.33334633335471153) - actor/pg_clipfrac:np.float64(0.0008001133483048761) - actor/ppo_kl:np.float64(5.691573064344387e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.2810823768377304) - perf/mfu/actor:np.float64(0.13878515421119064) - perf/max_memory_allocated_gb:np.float64(75.42926216125488) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(784.1409568786621) - 
actor/lr:np.float64(1e-06) - training/global_step:63 - training/epoch:10 - critic/score/mean:-4.702020168304443 - critic/score/max:-3.794605016708374 - critic/score/min:-6.659307956695557 - critic/rewards/mean:-4.702020168304443 - critic/rewards/max:-3.794605016708374 - critic/rewards/min:-6.659307956695557 - critic/advantages/mean:0.0035488230641931295 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0035488230641931295 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546998023986816 - response_length/mean:445.1959533691406 - response_length/max:659.0 - response_length/min:288.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:445.1959533691406 - response_length_non_aborted/max:659.0 - response_length_non_aborted/min:288.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.00016757100820541382 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(26.327550856396556) - timing_s/agent_loop/generate_sequences/max:np.float64(48.85462132282555) - timing_s/agent_loop/generate_sequences/mean:np.float64(43.051606088757886) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(48.85462132282555) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:554 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - 
timing_s/gen:497.66074243560433 - timing_s/reward:0.0004639197140932083 - timing_s/old_log_prob:56.36225847899914 - timing_s/ref:67.43412862531841 - timing_s/adv:0.08103403076529503 - timing_s/update_actor:463.3386064209044 - timing_s/update_weights:30.308901648968458 - timing_s/step:1116.0426472909749 - timing_s/stop_profile:0.0002814978361129761 - timing_per_token_ms/ref:0.046009865025513276 - timing_per_token_ms/update_actor:0.3161329015013215 - timing_per_token_ms/adv:5.528898932913156e-05 - timing_per_token_ms/gen:0.7277646378739528 - perf/total_num_tokens:1465645 - perf/time_per_step:1116.0426472909749 - perf/throughput:656.625893086448 +step:64 - global_seqlen/min:740422 - global_seqlen/max:740599 - global_seqlen/minmax_diff:177 - global_seqlen/balanced_min:740510 - global_seqlen/balanced_max:740511 - global_seqlen/mean:740510.5 - actor/entropy:0.38391798734664917 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.0005079726349019134) - actor/kl_loss:np.float64(0.33567978317538905) - actor/pg_clipfrac:np.float64(0.0009505360418794832) - actor/ppo_kl:np.float64(9.118288131541401e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.26950685679912567) - perf/mfu/actor:np.float64(0.13806193919390916) - perf/max_memory_allocated_gb:np.float64(76.15307903289795) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(782.9653263092041) - actor/lr:np.float64(1e-06) - training/global_step:64 - training/epoch:10 - critic/score/mean:-4.690016269683838 - critic/score/max:-4.166399955749512 - critic/score/min:-6.654351711273193 - critic/rewards/mean:-4.690016269683838 - critic/rewards/max:-4.166399955749512 - critic/rewards/min:-6.654351711273193 - critic/advantages/mean:0.0009980278555303812 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0009980278555303812 - critic/returns/max:1.1546998023986816 - 
critic/returns/min:-1.1546998023986816 - response_length/mean:455.2063903808594 - response_length/max:643.0 - response_length/min:274.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:455.2063903808594 - response_length_non_aborted/max:643.0 - response_length_non_aborted/min:274.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.0001256018877029419 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(24.12436388991773) - timing_s/agent_loop/generate_sequences/max:np.float64(51.40347355231643) - timing_s/agent_loop/generate_sequences/mean:np.float64(44.1188400205477) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(51.40347355231643) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:633 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:531.2316282335669 - timing_s/reward:0.00019377470016479492 - timing_s/old_log_prob:58.102232184261084 - timing_s/ref:68.09213111922145 - timing_s/adv:0.12626998126506805 - timing_s/update_actor:470.7625984735787 - timing_s/update_weights:23.01259026490152 - timing_s/step:1152.2169116772711 - timing_s/stop_profile:0.00022638775408267975 - timing_per_token_ms/ref:0.0459764791446046 - timing_per_token_ms/update_actor:0.3178635539088093 - timing_per_token_ms/adv:8.525873790112905e-05 - 
timing_per_token_ms/gen:0.7597738952449266 - perf/total_num_tokens:1481021 - perf/time_per_step:1152.2169116772711 - perf/throughput:642.6832417535393 +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 65} +validation generation end +step:65 - global_seqlen/min:740968 - global_seqlen/max:743464 - global_seqlen/minmax_diff:2496 - global_seqlen/balanced_min:742216 - global_seqlen/balanced_max:742216 - global_seqlen/mean:742216.0 - actor/entropy:0.38282015919685364 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(-0.00042520129257658107) - actor/kl_loss:np.float64(0.34532437225182855) - actor/pg_clipfrac:np.float64(0.0011185471982268307) - actor/ppo_kl:np.float64(4.433744945941953e-05) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.30435338616371155) - perf/mfu/actor:np.float64(0.1285525503968774) - perf/max_memory_allocated_gb:np.float64(76.15307903289795) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(787.7571277618408) - actor/lr:np.float64(1e-06) - val-aux/multiclinsum/reward/mean@1:np.float64(-4.670016354673049) - val-core/multiclinsum/acc/mean@1:np.float64(-4.670016379491017) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) - training/global_step:65 - training/epoch:10 - critic/score/mean:-4.696645736694336 - critic/score/max:-4.15031623840332 - critic/score/min:-6.656299114227295 - critic/rewards/mean:-4.696645736694336 - critic/rewards/max:-4.15031623840332 - critic/rewards/min:-6.656299114227295 - critic/advantages/mean:0.005235583521425724 - critic/advantages/max:1.154699683189392 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.005235583521425724 - critic/returns/max:1.154699683189392 - critic/returns/min:-1.1546998023986816 - 
response_length/mean:457.4270935058594 - response_length/max:658.0 - response_length/min:295.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:457.4270935058594 - response_length_non_aborted/max:658.0 - response_length_non_aborted/min:295.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.0002534426748752594 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(26.245533742010593) - timing_s/agent_loop/generate_sequences/max:np.float64(50.38546037301421) - timing_s/agent_loop/generate_sequences/mean:np.float64(44.049633324290575) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(50.38546037301421) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:615 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:551.408677496016 - timing_s/reward:0.0003407355397939682 - timing_s/old_log_prob:56.83834129758179 - timing_s/ref:68.62898791767657 - timing_s/adv:0.12451598979532719 - timing_s/update_actor:506.7615706846118 - timing_s/update_weights:26.746712811291218 - timing_s/step:1211.3883149456233 - timing_s/testing:161.2632163669914 - timing_s/stop_profile:0.00013137981295585632 - timing_per_token_ms/ref:0.046232490216915674 - timing_per_token_ms/update_actor:0.3413841595200129 - timing_per_token_ms/adv:8.388123524373442e-05 - 
timing_per_token_ms/gen:0.7848027313893609 - perf/total_num_tokens:1484432 - perf/time_per_step:1211.3883149456233 - perf/throughput:612.6986622232001 +step:66 - global_seqlen/min:744014 - global_seqlen/max:744722 - global_seqlen/minmax_diff:708 - global_seqlen/balanced_min:744368 - global_seqlen/balanced_max:744368 - global_seqlen/mean:744368.0 - actor/entropy:0.3800479769706726 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.00020732540967098242) - actor/kl_loss:np.float64(0.34672634551922477) - actor/pg_clipfrac:np.float64(0.000900313180560867) - actor/ppo_kl:np.float64(0.00011454703303570568) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.27117905020713806) - perf/mfu/actor:np.float64(0.12843348564158819) - perf/max_memory_allocated_gb:np.float64(76.30922079086304) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(784.6429786682129) - actor/lr:np.float64(1e-06) - training/global_step:66 - training/epoch:10 - critic/score/mean:-4.67031717300415 - critic/score/max:0.0 - critic/score/min:-6.658824443817139 - critic/rewards/mean:-4.67031717300415 - critic/rewards/max:0.0 - critic/rewards/min:-6.658824443817139 - critic/advantages/mean:0.001002822769805789 - critic/advantages/max:1.1546993255615234 - critic/advantages/min:-1.1546999216079712 - critic/returns/mean:0.001002822769805789 - critic/returns/max:1.1546993255615234 - critic/returns/min:-1.1546999216079712 - response_length/mean:460.2291564941406 - response_length/max:661.0 - response_length/min:261.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:460.2291564941406 - response_length_non_aborted/max:661.0 - response_length_non_aborted/min:261.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - 
num_turns/mean:np.float64(2.0) - timing_s/start_profile:6.473623216152191e-05 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(23.101801546290517) - timing_s/agent_loop/generate_sequences/max:np.float64(51.148000333458185) - timing_s/agent_loop/generate_sequences/mean:np.float64(44.60728419449151) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(51.148000333458185) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:638 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:536.6997501719743 - timing_s/reward:0.0002826191484928131 - timing_s/old_log_prob:58.35410931892693 - timing_s/ref:65.46710864827037 - timing_s/adv:0.1076844111084938 - timing_s/update_actor:508.8071043435484 - timing_s/update_weights:28.46160177886486 - timing_s/step:1198.6944473572075 - timing_s/stop_profile:0.0002249646931886673 - timing_per_token_ms/ref:0.04397496174491002 - timing_per_token_ms/update_actor:0.34177121016993506 - timing_per_token_ms/adv:7.233277834921289e-05 - timing_per_token_ms/gen:0.7592172012527363 - perf/total_num_tokens:1488736 - perf/time_per_step:1198.6944473572075 - perf/throughput:620.9822708707188 +step:67 - global_seqlen/min:735804 - global_seqlen/max:737352 - global_seqlen/minmax_diff:1548 - global_seqlen/balanced_min:736578 - global_seqlen/balanced_max:736578 - global_seqlen/mean:736578.0 - actor/entropy:0.3762032091617584 - perf/mfu/actor_infer:0 - actor/pg_loss:np.float64(0.0005189485455048218) - actor/kl_loss:np.float64(0.3505901601165533) - actor/pg_clipfrac:np.float64(0.0010491076294177522) - 
actor/ppo_kl:np.float64(0.00021311884781501553) - actor/pg_clipfrac_lower:np.float64(0.0) - actor/kl_coef:np.float64(0.001) - actor/grad_norm:np.float64(0.30628491938114166) - perf/mfu/actor:np.float64(0.12799852017988836) - perf/max_memory_allocated_gb:np.float64(76.30922079086304) - perf/max_memory_reserved_gb:np.float64(77.048828125) - perf/cpu_memory_used_gb:np.float64(784.6223201751709) - actor/lr:np.float64(1e-06) - training/global_step:67 - training/epoch:11 - critic/score/mean:-4.6939568519592285 - critic/score/max:-4.139372825622559 - critic/score/min:-6.656299114227295 - critic/rewards/mean:-4.6939568519592285 - critic/rewards/max:-4.139372825622559 - critic/rewards/min:-6.656299114227295 - critic/advantages/mean:0.0029598698019981384 - critic/advantages/max:1.1546998023986816 - critic/advantages/min:-1.1546998023986816 - critic/returns/mean:0.0029598698019981384 - critic/returns/max:1.1546998023986816 - critic/returns/min:-1.1546998023986816 - response_length/mean:450.0859375 - response_length/max:647.0 - response_length/min:269.0 - response_length/clip_ratio:0.0 - response_length_non_aborted/mean:450.0859375 - response_length_non_aborted/max:647.0 - response_length_non_aborted/min:269.0 - response_length_non_aborted/clip_ratio:0.0 - response/aborted_ratio:0.0 - prompt_length/mean:509.0 - prompt_length/max:509.0 - prompt_length/min:509.0 - prompt_length/clip_ratio:0.0 - num_turns/min:np.int32(2) - num_turns/max:np.int32(2) - num_turns/mean:np.float64(2.0) - timing_s/start_profile:0.0004110913723707199 - timing_s/agent_loop/num_preempted/min:np.int64(-1) - timing_s/agent_loop/num_preempted/max:np.int64(-1) - timing_s/agent_loop/num_preempted/mean:np.float64(-1.0) - timing_s/agent_loop/generate_sequences/min:np.float64(23.986371506005526) - timing_s/agent_loop/generate_sequences/max:np.float64(49.89639197476208) - timing_s/agent_loop/generate_sequences/mean:np.float64(43.192123773299194) - timing_s/agent_loop/tool_calls/min:np.float64(0.0) - 
timing_s/agent_loop/tool_calls/max:np.float64(0.0) - timing_s/agent_loop/tool_calls/mean:np.float64(0.0) - timing_s/agent_loop/slowest/generate_sequences:np.float64(49.89639197476208) - timing_s/agent_loop/slowest/tool_calls:np.float64(0.0) - timing_s/agent_loop/slowest/prompt_length:509 - timing_s/agent_loop/slowest/response_length:595 - timing_s/agent_loop/slowest/num_preempted:np.int64(-1) - timing_s/gen:524.2807809002697 - timing_s/reward:0.0003207940608263016 - timing_s/old_log_prob:57.421982407569885 - timing_s/ref:66.02561225369573 - timing_s/adv:0.11871197633445263 - timing_s/update_actor:504.9631587713957 - timing_s/update_weights:30.141695799306035 - timing_s/step:1183.7784282360226 - timing_s/stop_profile:0.00030366890132427216 - timing_per_token_ms/ref:0.044819158496245966 - timing_per_token_ms/update_actor:0.34277643289060744 - timing_per_token_ms/adv:8.058343877664865e-05 - timing_per_token_ms/gen:0.7583632479044363 - perf/total_num_tokens:1473156 - perf/time_per_step:1183.7784282360226 - perf/throughput:622.2262396668209 diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/files/requirements.txt b/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c168a7097ef00c84a34fe1db69f95d6eaafb03f2 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/files/requirements.txt @@ -0,0 +1,269 @@ +verl==0.8.0.dev0 +psutil==7.1.3 +colorama==0.4.6 +annotated-doc==0.0.4 +sentry-sdk==2.51.0 +requests==2.32.5 +nvidia-cufile-cu12==1.13.1.3 +ml_dtypes==0.5.4 +xformers==0.0.32.post1 +sglang==0.5.2 +multidict==6.7.1 +typing_extensions==4.15.0 +nvidia-cusparselt-cu12==0.7.1 +openai-harmony==0.0.4 +transformers==4.56.1 +Werkzeug==3.1.5 +identify==2.6.16 +pytest==9.0.2 +nvidia-cuda-runtime-cu12==12.8.90 +GitPython==3.1.46 +cupy-cuda12x==13.6.0 +tokenizers==0.22.2 +pybind11==3.0.1 +google-api-core==2.29.0 
+partial-json-parser==0.2.1.1.post7 +aiohttp-cors==0.8.1 +sniffio==1.3.1 +tensordict==0.10.0 +smart_open==7.5.0 +cffi==2.0.0 +wcwidth==0.5.3 +asttokens==3.0.1 +opencensus==0.11.4 +rpds-py==0.30.0 +py-spy==0.4.1 +gguf==0.17.1 +nvidia-nvjitlink-cu12==12.8.93 +httpx==0.28.1 +cuda-python==13.1.1 +annotated-types==0.7.0 +regex==2026.1.15 +vllm==0.11.0 +idna==3.11 +fsspec==2025.10.0 +parso==0.8.5 +pydantic-extra-types==2.11.0 +MarkupSafe==3.0.3 +cryptography==46.0.4 +filelock==3.20.3 +modelscope==1.34.0 +outlines==0.1.11 +dnspython==2.8.0 +scipy==1.17.0 +zipp==3.23.0 +PyYAML==6.0.3 +onnx==1.20.1 +torchdata==0.11.0 +cuda-pathfinder==1.3.3 +verl==0.8.0.dev0 +httptools==0.7.1 +opencv-python-headless==4.13.0.90 +importlib_metadata==8.7.1 +peft==0.18.1 +opentelemetry-sdk==1.39.1 +python-json-logger==4.0.0 +cuda-bindings==13.1.1 +mdurl==0.1.2 +referencing==0.37.0 +xxhash==3.6.0 +interegular==0.3.3 +fastapi-cli==0.0.20 +uv==0.9.28 +tensorboard==2.20.0 +nvidia-cublas-cu12==12.8.4.1 +sentencepiece==0.2.1 +rich-toolkit==0.18.1 +numpy==2.2.0 +yarl==1.22.0 +opencv-fixer==0.2.5 +tqdm==4.67.2 +python-dotenv==1.2.1 +timm==1.0.16 +aiohappyeyeballs==2.6.1 +decord==0.6.0 +jiter==0.12.0 +airportsdata==20250909 +markdown-it-py==4.0.0 +nvidia-cusolver-cu12==11.7.3.90 +pyarrow==23.0.0 +opentelemetry-proto==1.39.1 +anyio==4.12.1 +pycryptodomex==3.23.0 +prometheus_client==0.24.1 +aiohttp==3.13.3 +urllib3==2.6.3 +pexpect==4.9.0 +pydantic-settings==2.12.0 +distro==1.9.0 +av==16.1.0 +cloudpickle==3.1.2 +mpmath==1.3.0 +certifi==2026.1.4 +antlr4-python3-runtime==4.9.3 +torchvision==0.23.0 +accelerate==1.12.0 +watchfiles==1.1.1 +ruff==0.14.14 +wheel==0.46.3 +omegaconf==2.3.0 +nvidia-cufft-cu12==11.3.3.83 +multiprocess==0.70.18 +frozendict==2.4.7 +sympy==1.14.0 +setproctitle==1.3.7 +setuptools==79.0.1 +py-cpuinfo==9.0.0 +ipython_pygments_lexers==1.1.1 +rich==14.3.2 +uvicorn==0.40.0 +openai==1.99.1 +outlines_core==0.2.11 +llvmlite==0.44.0 +nvidia-cuda-cupti-cu12==12.8.90 +attrs==25.4.0 
+anthropic==0.77.0 +packaging==25.0 +fastrlock==0.8.3 +astor==0.8.1 +pluggy==1.6.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +psutil==7.2.2 +virtualenv==20.36.1 +cbor2==5.8.0 +compressed-tensors==0.11.0 +nvidia-cusparse-cu12==12.5.8.93 +networkx==3.6.1 +httpcore==1.0.9 +onnxscript==0.3.1 +smmap==5.0.2 +opencv-python==4.13.0.90 +traitlets==5.14.3 +python-multipart==0.0.22 +pyvers==0.1.0 +huggingface-hub==0.36.0 +pillow==12.1.0 +jsonschema==4.26.0 +cfgv==3.5.0 +optree==0.18.0 +email-validator==2.3.0 +tabulate==0.9.0 +pre_commit==4.5.1 +msgpack==1.1.2 +depyf==0.19.0 +numba==0.61.2 +six==1.17.0 +aiosignal==1.4.0 +nvidia-nvtx-cu12==12.8.90 +propcache==0.4.1 +torch_memory_saver==0.0.8 +h11==0.16.0 +frozenlist==1.8.0 +websockets==16.0 +nvidia-cudnn-frontend==1.18.0 +build==1.4.0 +google-auth==2.48.0 +pycountry==24.6.1 +stack-data==0.6.3 +typing-inspection==0.4.2 +googleapis-common-protos==1.72.0 +pandas==3.0.0 +typer==0.21.1 +protobuf==6.33.5 +fastapi==0.128.0 +blake3==1.0.8 +opentelemetry-semantic-conventions==0.60b1 +opentelemetry-exporter-prometheus==0.60b1 +nvidia-cudnn-cu12==9.10.2.21 +Markdown==3.10.1 +liger_kernel==0.6.4 +nodeenv==1.10.0 +prompt_toolkit==3.0.52 +torchaudio==2.8.0 +codetiming==1.4.0 +platformdirs==4.5.1 +jsonschema-specifications==2025.9.1 +hydra-core==1.3.2 +tensorboard-data-server==0.7.2 +lm-format-enforcer==0.11.3 +pyasn1_modules==0.4.2 +tiktoken==0.12.0 +starlette==0.50.0 +pyproject_hooks==1.2.0 +flash_attn==2.8.1 +rsa==4.9.1 +ray==2.53.0 +nest-asyncio==1.6.0 +lark==1.2.2 +fastar==0.8.0 +orjson==3.11.6 +prometheus-fastapi-instrumentator==7.1.0 +opentelemetry-api==1.39.1 +mathruler==0.1.0 +pydantic_core==2.41.5 +fastapi-cloud-cli==0.11.0 +pynvml==13.0.1 +loguru==0.7.3 +torch==2.8.0 +msgspec==0.20.0 +nvidia-curand-cu12==10.3.9.90 +blobfile==3.0.0 +gitdb==4.0.12 +llguidance==0.7.30 +hf_transfer==0.1.9 +nvidia-nccl-cu12==2.27.3 +qwen-vl-utils==0.0.14 +ptyprocess==0.7.0 +ipdb==0.13.13 +opencensus-context==0.1.3 +jedi==0.19.2 +click==8.3.1 +datasets==4.5.0 
+soxr==1.0.0 +sgl-kernel==0.3.9.post2 +colorful==0.5.8 +pyasn1==0.6.2 +charset-normalizer==3.4.4 +nvidia-ml-py==13.590.48 +hf-xet==1.2.0 +dill==0.4.0 +absl-py==2.4.0 +pydantic==2.12.5 +wrapt==2.1.0 +flashinfer-python==0.3.1 +python-dateutil==2.9.0.post0 +torchao==0.9.0 +cachetools==7.0.0 +soundfile==0.13.1 +diskcache==5.6.3 +onnx-ir==0.1.15 +docstring_parser==0.17.0 +matplotlib-inline==0.2.1 +Pygments==2.19.2 +wandb==0.24.1 +pure_eval==0.2.3 +ninja==1.13.0 +proto-plus==1.27.0 +pyzmq==27.1.0 +iniconfig==2.3.0 +Jinja2==3.1.6 +megatron-core==0.13.1 +uvloop==0.22.1 +pycparser==3.0 +pylatexenc==2.10 +decorator==5.2.1 +shellingham==1.5.4 +lxml==6.0.2 +safetensors==0.7.0 +xgrammar==0.1.25 +pybase64==1.4.3 +ipython==9.9.0 +mistral_common==1.9.0 +rignore==0.7.6 +einops==0.8.2 +distlib==0.4.0 +triton==3.4.0 +executing==2.2.1 +grpcio==1.76.0 +pip==25.3 +verl==0.8.0.dev0 diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/files/wandb-metadata.json b/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf418057931b776b6526f866272241becb58c9a --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/files/wandb-metadata.json @@ -0,0 +1,93 @@ +{ + "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.12", + "startedAt": "2026-02-02T14:52:27.364301Z", + "args": [ + "--node-ip-address=172.16.34.29", + "--node-manager-port=44173", + "--object-store-name=/tmp/ray/session_2026-02-02_09-46-37_229579_2792949/sockets/plasma_store", + "--raylet-name=/tmp/ray/session_2026-02-02_09-46-37_229579_2792949/sockets/raylet", + "--redis-address=None", + "--metrics-agent-port=59209", + "--logging-rotate-bytes=536870912", + "--logging-rotate-backup-count=5", + "--runtime-env-agent-port=65414", + "--gcs-address=172.16.34.29:63816", + "--session-name=session_2026-02-02_09-46-37_229579_2792949", + 
"--temp-dir=/tmp/ray", + "--webui=127.0.0.1:8301", + "--cluster-id=39a2162a1a775a0c652ad9a25f459ddc87ae0e16c7e2ce61a2db083d", + "--startup-token=128", + "--worker-launch-time-ms=1770043610629", + "--node-id=b8834a3d730307500a971f42611c49920f7a913a7105abfd56fbc999", + "--runtime-env-hash=1096984665" + ], + "program": "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py", + "git": { + "remote": "https://github.com/verl-project/verl", + "commit": "d9939add7a2a01923a9088891f913a5d20c4e622" + }, + "email": "shahidulshakib034@gmail.com", + "root": "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train", + "host": "gamma", + "executable": "/home/mshahidul/miniconda3/envs/verl2/bin/python3", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA A100 80GB PCIe", + "gpu_count": 6, + "disk": { + "/": { + "total": "3766429188096", + "used": "182962978816" + } + }, + "memory": { + "total": "1081814863872" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": 
"Hopper", + "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece" + } + ], + "cudaVersion": "13.0", + "writerId": "kfar92kzq2tt3vn3peb4zpbz23c0q97e" +} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/files/wandb-summary.json b/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..22631bd87c840e6ab3672c13199d361f10533197 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/files/wandb-summary.json @@ -0,0 +1 @@ +{"val-aux/multiclinsum/reward/mean@1":-4.670016354673049,"timing_s/gen":524.2807809002697,"response_length_non_aborted/min":269,"global_seqlen/max":737352,"global_seqlen/balanced_max":736578,"training/global_step":67,"timing_per_token_ms/gen":0.7583632479044363,"timing_s/agent_loop/tool_calls/max":0,"global_seqlen/mean":736578,"val-core/multiclinsum/acc/mean@1":-4.670016379491017,"perf/max_memory_reserved_gb":77.048828125,"training/epoch":11,"response_length/clip_ratio":0,"critic/returns/mean":0.0029598698019981384,"timing_s/agent_loop/generate_sequences/mean":43.192123773299194,"_wandb":{"runtime":76237},"actor/kl_coef":0.001,"perf/max_memory_allocated_gb":76.30922079086304,"global_seqlen/min":735804,"timing_s/agent_loop/tool_calls/mean":0,"timing_s/update_weights":30.141695799306035,"actor/grad_norm":0.30628491938114166,"response/aborted_ratio":0,"actor/pg_clipfrac_lower":0,"timing_s/save_checkpoint":137.9368465486914,"timing_s/start_profile":0.0004110913723707199,"critic/rewards/max":-4.139372825622559,"global_seqlen/minmax_diff":1548,"_runtime":76237.64345447,"critic/returns/max":1.1546998023986816,"critic/rewards/mean":-4.6939568519592285,"critic/advantages/max":1.1546998023986816,"critic/rewards/min":-6.656299114227295,"timing_per_token_ms/update_actor":0.34277643289060744,"timing_s/ref":66.02561225369573,"perf/throughput":622.2262396668209,"timing_s/adv":0.1187119
7633445263,"global_seqlen/balanced_min":736578,"val-aux/num_turns/min":2,"perf/total_num_tokens":1473156,"prompt_length/mean":509,"_timestamp":1.7701192637142365e+09,"response_length_non_aborted/clip_ratio":0,"prompt_length/clip_ratio":0,"critic/returns/min":-1.1546998023986816,"response_length_non_aborted/mean":450.0859375,"response_length/max":647,"_step":67,"prompt_length/max":509,"timing_per_token_ms/adv":8.058343877664865e-05,"num_turns/min":2,"timing_s/testing":161.2632163669914,"critic/score/min":-6.656299114227295,"perf/mfu/actor_infer":0,"timing_s/agent_loop/num_preempted/mean":-1,"perf/mfu/actor":0.12799852017988836,"perf/cpu_memory_used_gb":784.6223201751709,"perf/time_per_step":1183.7784282360226,"actor/entropy":0.3762032091617584,"critic/score/max":-4.139372825622559,"timing_s/stop_profile":0.00030366890132427216,"actor/kl_loss":0.3505901601165533,"critic/advantages/min":-1.1546998023986816,"timing_s/agent_loop/generate_sequences/max":49.89639197476208,"timing_s/reward":0.0003207940608263016,"val-aux/num_turns/max":2,"timing_s/agent_loop/tool_calls/min":0,"timing_s/step":1183.7784282360226,"timing_s/agent_loop/slowest/response_length":595,"response_length_non_aborted/max":647,"actor/pg_loss":0.0005189485455048218,"timing_s/agent_loop/slowest/generate_sequences":49.89639197476208,"timing_s/agent_loop/slowest/num_preempted":-1,"num_turns/mean":2,"response_length/mean":450.0859375,"actor/ppo_kl":0.00021311884781501553,"critic/score/mean":-4.6939568519592285,"timing_s/agent_loop/slowest/tool_calls":0,"timing_s/agent_loop/num_preempted/max":-1,"actor/pg_clipfrac":0.0010491076294177522,"timing_s/update_actor":504.9631587713957,"val-aux/num_turns/mean":2,"num_turns/max":2,"critic/advantages/mean":0.0029598698019981384,"timing_s/agent_loop/num_preempted/min":-1,"timing_s/agent_loop/generate_sequences/min":23.986371506005526,"timing_s/agent_loop/slowest/prompt_length":509,"prompt_length/min":509,"actor/lr":1e-06,"timing_per_token_ms/ref":0.044819158496245966,"re
sponse_length/min":269,"timing_s/old_log_prob":57.421982407569885} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/logs/debug-core.log b/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..0971626befe28231f725e3043ca2c64576ff9d78 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/logs/debug-core.log @@ -0,0 +1,12 @@ +{"time":"2026-02-02T09:52:27.534740429-05:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpg99pu6ot/port-2812782.txt","pid":2812782,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-02T09:52:27.536079763-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":2812782} +{"time":"2026-02-02T09:52:27.536065393-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-2812782-2822905-474298484/socket","Net":"unix"}} +{"time":"2026-02-02T09:52:27.689797641-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-02T09:52:27.70431933-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"bx2ydf22","id":"1(@)"} +{"time":"2026-02-02T09:52:28.438291706-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"bx2ydf22","id":"1(@)"} +{"time":"2026-02-02T09:52:34.732021809-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"gk970j8qhh3q"} +{"time":"2026-02-03T07:03:06.419445692-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"gk970j8qhh3q"} +{"time":"2026-02-03T07:03:07.202351913-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"gk970j8qhh3q"} +{"time":"2026-02-03T07:03:07.207572025-05:00","level":"INFO","msg":"handleInformFinish: 
finish message received","streamId":"bx2ydf22","id":"1(@)"} +{"time":"2026-02-03T07:03:07.211608706-05:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"bx2ydf22","id":"1(@)"} +{"time":"2026-02-03T07:03:09.236575671-05:00","level":"INFO","msg":"server: parent process exited, terminating service process"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/logs/debug-internal.log b/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..0f1d04a310a5e2917902bd964e432cc9cb04fac5 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2026-02-02T09:52:27.706582741-05:00","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-02T09:52:28.435322196-05:00","level":"INFO","msg":"stream: created new stream","id":"bx2ydf22"} +{"time":"2026-02-02T09:52:28.435590689-05:00","level":"INFO","msg":"handler: started","stream_id":"bx2ydf22"} +{"time":"2026-02-02T09:52:28.438252869-05:00","level":"INFO","msg":"stream: started","id":"bx2ydf22"} +{"time":"2026-02-02T09:52:28.438420611-05:00","level":"INFO","msg":"writer: started","stream_id":"bx2ydf22"} +{"time":"2026-02-02T09:52:28.438430838-05:00","level":"INFO","msg":"sender: started","stream_id":"bx2ydf22"} +{"time":"2026-02-03T07:03:06.998836043-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-03T07:03:07.189028317-05:00","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2026-02-03T07:03:07.207605536-05:00","level":"INFO","msg":"stream: closing","id":"bx2ydf22"} +{"time":"2026-02-03T07:03:07.207652867-05:00","level":"INFO","msg":"handler: closed","stream_id":"bx2ydf22"} +{"time":"2026-02-03T07:03:07.210097735-05:00","level":"INFO","msg":"sender: closed","stream_id":"bx2ydf22"} 
+{"time":"2026-02-03T07:03:07.210125841-05:00","level":"INFO","msg":"stream: closed","id":"bx2ydf22"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..d576e2a770cb9de1d3d2f700ba7c29252e8be1e9 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/logs/debug.log @@ -0,0 +1,24 @@ +2026-02-02 09:52:27,388 INFO MainThread:2812782 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-02 09:52:27,388 INFO MainThread:2812782 [wandb_setup.py:_flush():81] Configure stats pid to 2812782 +2026-02-02 09:52:27,388 INFO MainThread:2812782 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-02 09:52:27,389 INFO MainThread:2812782 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/logs/debug.log +2026-02-02 09:52:27,389 INFO MainThread:2812782 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/logs/debug-internal.log +2026-02-02 09:52:27,389 INFO MainThread:2812782 [wandb_init.py:init():844] calling init triggers +2026-02-02 09:52:27,391 INFO MainThread:2812782 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'fsdp_config': 
{'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 32, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 
'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 
'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 1024, 'response_length': 2048, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.6, 'ignore_eos': False, 'enforce_eager': False, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': 
False, 'agent': {'_target_': 'verl.workers.config.AgentLoopConfig', 'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': 
False}, 'model': {'_target_': 'verl.workers.config.HFModelConfig', 'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet', 'train_max_samples': -1, 'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 1024, 'max_response_length': 2048, 'train_batch_size': 512, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': 
None}, 'datagen': {'path': None, 'name': None}, 'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 
'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 
'reward_loop_class_name': None, 'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 
'target_kl': 0.1}, 'use_pf_ppo': False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-optimized-multiclinsum-gs', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 20, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 5, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': '/home/mshahidul/readctrl/code/RL_model/RL_model', 'max_actor_ckpt_to_keep': 1, 'max_critic_ckpt_to_keep': 1, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto', 'remove_previous_ckpt_in_save': True}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, '_wandb': {}} +2026-02-02 09:52:27,391 INFO MainThread:2812782 [wandb_init.py:init():892] starting 
backend +2026-02-02 09:52:27,690 INFO MainThread:2812782 [wandb_init.py:init():895] sending inform_init request +2026-02-02 09:52:27,699 INFO MainThread:2812782 [wandb_init.py:init():903] backend started and connected +2026-02-02 09:52:27,717 INFO MainThread:2812782 [wandb_init.py:init():973] updated telemetry +2026-02-02 09:52:27,739 INFO MainThread:2812782 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-02 09:52:28,781 INFO MainThread:2812782 [wandb_init.py:init():1042] starting run threads in backend +2026-02-02 09:52:29,701 INFO MainThread:2812782 [wandb_run.py:_console_start():2529] atexit reg +2026-02-02 09:52:29,702 INFO MainThread:2812782 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-02 09:52:29,702 INFO MainThread:2812782 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-02 09:52:29,702 INFO MainThread:2812782 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-02 09:52:29,718 INFO MainThread:2812782 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-03 07:03:06,414 INFO MainThread:2812782 [wandb_run.py:_finish():2295] finishing run shahidulshakib034-khulna-university-of-engineering-techn/readctrl-verl/bx2ydf22 +2026-02-03 07:03:06,416 INFO MainThread:2812782 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0 +2026-02-03 07:03:06,418 INFO MainThread:2812782 [wandb_run.py:_restore():2476] restore +2026-02-03 07:03:06,419 INFO MainThread:2812782 [wandb_run.py:_restore():2482] restore done +2026-02-03 07:03:07,203 INFO MainThread:2812782 [wandb_run.py:_footer_sync_info():3871] logging synced files diff --git a/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..d78af0c289eb892ae52257239a41a200ae2d6cd8 --- /dev/null +++ 
b/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/files/output.log @@ -0,0 +1,6 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /home/mshahidul/readctrl/code/RL_model/train_v2/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/files/requirements.txt b/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c168a7097ef00c84a34fe1db69f95d6eaafb03f2 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/files/requirements.txt @@ -0,0 +1,269 @@ +verl==0.8.0.dev0 +psutil==7.1.3 +colorama==0.4.6 +annotated-doc==0.0.4 +sentry-sdk==2.51.0 +requests==2.32.5 +nvidia-cufile-cu12==1.13.1.3 +ml_dtypes==0.5.4 +xformers==0.0.32.post1 +sglang==0.5.2 +multidict==6.7.1 +typing_extensions==4.15.0 +nvidia-cusparselt-cu12==0.7.1 +openai-harmony==0.0.4 +transformers==4.56.1 +Werkzeug==3.1.5 +identify==2.6.16 +pytest==9.0.2 +nvidia-cuda-runtime-cu12==12.8.90 +GitPython==3.1.46 +cupy-cuda12x==13.6.0 +tokenizers==0.22.2 +pybind11==3.0.1 +google-api-core==2.29.0 +partial-json-parser==0.2.1.1.post7 +aiohttp-cors==0.8.1 +sniffio==1.3.1 +tensordict==0.10.0 +smart_open==7.5.0 +cffi==2.0.0 +wcwidth==0.5.3 +asttokens==3.0.1 +opencensus==0.11.4 +rpds-py==0.30.0 +py-spy==0.4.1 +gguf==0.17.1 +nvidia-nvjitlink-cu12==12.8.93 +httpx==0.28.1 +cuda-python==13.1.1 +annotated-types==0.7.0 +regex==2026.1.15 +vllm==0.11.0 +idna==3.11 +fsspec==2025.10.0 +parso==0.8.5 
+pydantic-extra-types==2.11.0 +MarkupSafe==3.0.3 +cryptography==46.0.4 +filelock==3.20.3 +modelscope==1.34.0 +outlines==0.1.11 +dnspython==2.8.0 +scipy==1.17.0 +zipp==3.23.0 +PyYAML==6.0.3 +onnx==1.20.1 +torchdata==0.11.0 +cuda-pathfinder==1.3.3 +verl==0.8.0.dev0 +httptools==0.7.1 +opencv-python-headless==4.13.0.90 +importlib_metadata==8.7.1 +peft==0.18.1 +opentelemetry-sdk==1.39.1 +python-json-logger==4.0.0 +cuda-bindings==13.1.1 +mdurl==0.1.2 +referencing==0.37.0 +xxhash==3.6.0 +interegular==0.3.3 +fastapi-cli==0.0.20 +uv==0.9.28 +tensorboard==2.20.0 +nvidia-cublas-cu12==12.8.4.1 +sentencepiece==0.2.1 +rich-toolkit==0.18.1 +numpy==2.2.0 +yarl==1.22.0 +opencv-fixer==0.2.5 +tqdm==4.67.2 +python-dotenv==1.2.1 +timm==1.0.16 +aiohappyeyeballs==2.6.1 +decord==0.6.0 +jiter==0.12.0 +airportsdata==20250909 +markdown-it-py==4.0.0 +nvidia-cusolver-cu12==11.7.3.90 +pyarrow==23.0.0 +opentelemetry-proto==1.39.1 +anyio==4.12.1 +pycryptodomex==3.23.0 +prometheus_client==0.24.1 +aiohttp==3.13.3 +urllib3==2.6.3 +pexpect==4.9.0 +pydantic-settings==2.12.0 +distro==1.9.0 +av==16.1.0 +cloudpickle==3.1.2 +mpmath==1.3.0 +certifi==2026.1.4 +antlr4-python3-runtime==4.9.3 +torchvision==0.23.0 +accelerate==1.12.0 +watchfiles==1.1.1 +ruff==0.14.14 +wheel==0.46.3 +omegaconf==2.3.0 +nvidia-cufft-cu12==11.3.3.83 +multiprocess==0.70.18 +frozendict==2.4.7 +sympy==1.14.0 +setproctitle==1.3.7 +setuptools==79.0.1 +py-cpuinfo==9.0.0 +ipython_pygments_lexers==1.1.1 +rich==14.3.2 +uvicorn==0.40.0 +openai==1.99.1 +outlines_core==0.2.11 +llvmlite==0.44.0 +nvidia-cuda-cupti-cu12==12.8.90 +attrs==25.4.0 +anthropic==0.77.0 +packaging==25.0 +fastrlock==0.8.3 +astor==0.8.1 +pluggy==1.6.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +psutil==7.2.2 +virtualenv==20.36.1 +cbor2==5.8.0 +compressed-tensors==0.11.0 +nvidia-cusparse-cu12==12.5.8.93 +networkx==3.6.1 +httpcore==1.0.9 +onnxscript==0.3.1 +smmap==5.0.2 +opencv-python==4.13.0.90 +traitlets==5.14.3 +python-multipart==0.0.22 +pyvers==0.1.0 +huggingface-hub==0.36.0 
+pillow==12.1.0 +jsonschema==4.26.0 +cfgv==3.5.0 +optree==0.18.0 +email-validator==2.3.0 +tabulate==0.9.0 +pre_commit==4.5.1 +msgpack==1.1.2 +depyf==0.19.0 +numba==0.61.2 +six==1.17.0 +aiosignal==1.4.0 +nvidia-nvtx-cu12==12.8.90 +propcache==0.4.1 +torch_memory_saver==0.0.8 +h11==0.16.0 +frozenlist==1.8.0 +websockets==16.0 +nvidia-cudnn-frontend==1.18.0 +build==1.4.0 +google-auth==2.48.0 +pycountry==24.6.1 +stack-data==0.6.3 +typing-inspection==0.4.2 +googleapis-common-protos==1.72.0 +pandas==3.0.0 +typer==0.21.1 +protobuf==6.33.5 +fastapi==0.128.0 +blake3==1.0.8 +opentelemetry-semantic-conventions==0.60b1 +opentelemetry-exporter-prometheus==0.60b1 +nvidia-cudnn-cu12==9.10.2.21 +Markdown==3.10.1 +liger_kernel==0.6.4 +nodeenv==1.10.0 +prompt_toolkit==3.0.52 +torchaudio==2.8.0 +codetiming==1.4.0 +platformdirs==4.5.1 +jsonschema-specifications==2025.9.1 +hydra-core==1.3.2 +tensorboard-data-server==0.7.2 +lm-format-enforcer==0.11.3 +pyasn1_modules==0.4.2 +tiktoken==0.12.0 +starlette==0.50.0 +pyproject_hooks==1.2.0 +flash_attn==2.8.1 +rsa==4.9.1 +ray==2.53.0 +nest-asyncio==1.6.0 +lark==1.2.2 +fastar==0.8.0 +orjson==3.11.6 +prometheus-fastapi-instrumentator==7.1.0 +opentelemetry-api==1.39.1 +mathruler==0.1.0 +pydantic_core==2.41.5 +fastapi-cloud-cli==0.11.0 +pynvml==13.0.1 +loguru==0.7.3 +torch==2.8.0 +msgspec==0.20.0 +nvidia-curand-cu12==10.3.9.90 +blobfile==3.0.0 +gitdb==4.0.12 +llguidance==0.7.30 +hf_transfer==0.1.9 +nvidia-nccl-cu12==2.27.3 +qwen-vl-utils==0.0.14 +ptyprocess==0.7.0 +ipdb==0.13.13 +opencensus-context==0.1.3 +jedi==0.19.2 +click==8.3.1 +datasets==4.5.0 +soxr==1.0.0 +sgl-kernel==0.3.9.post2 +colorful==0.5.8 +pyasn1==0.6.2 +charset-normalizer==3.4.4 +nvidia-ml-py==13.590.48 +hf-xet==1.2.0 +dill==0.4.0 +absl-py==2.4.0 +pydantic==2.12.5 +wrapt==2.1.0 +flashinfer-python==0.3.1 +python-dateutil==2.9.0.post0 +torchao==0.9.0 +cachetools==7.0.0 +soundfile==0.13.1 +diskcache==5.6.3 +onnx-ir==0.1.15 +docstring_parser==0.17.0 +matplotlib-inline==0.2.1 
+Pygments==2.19.2 +wandb==0.24.1 +pure_eval==0.2.3 +ninja==1.13.0 +proto-plus==1.27.0 +pyzmq==27.1.0 +iniconfig==2.3.0 +Jinja2==3.1.6 +megatron-core==0.13.1 +uvloop==0.22.1 +pycparser==3.0 +pylatexenc==2.10 +decorator==5.2.1 +shellingham==1.5.4 +lxml==6.0.2 +safetensors==0.7.0 +xgrammar==0.1.25 +pybase64==1.4.3 +ipython==9.9.0 +mistral_common==1.9.0 +rignore==0.7.6 +einops==0.8.2 +distlib==0.4.0 +triton==3.4.0 +executing==2.2.1 +grpcio==1.76.0 +pip==25.3 +verl==0.8.0.dev0 diff --git a/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/files/wandb-metadata.json b/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..900ceacc16b10449da267783b68e584d98277e0c --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/files/wandb-metadata.json @@ -0,0 +1,93 @@ +{ + "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.12", + "startedAt": "2026-02-07T01:43:52.729516Z", + "args": [ + "--node-ip-address=172.16.34.29", + "--node-manager-port=41525", + "--object-store-name=/tmp/ray/session_2026-02-06_20-37-48_363201_2953054/sockets/plasma_store", + "--raylet-name=/tmp/ray/session_2026-02-06_20-37-48_363201_2953054/sockets/raylet", + "--redis-address=None", + "--metrics-agent-port=64482", + "--logging-rotate-bytes=536870912", + "--logging-rotate-backup-count=5", + "--runtime-env-agent-port=56293", + "--gcs-address=172.16.34.29:63654", + "--session-name=session_2026-02-06_20-37-48_363201_2953054", + "--temp-dir=/tmp/ray", + "--webui=127.0.0.1:8297", + "--cluster-id=d8b70fb3148a2d9c54bf8d2df9fb86748ddd8d787f9c609753f9177b", + "--startup-token=128", + "--worker-launch-time-ms=1770428281682", + "--node-id=97437a8785970b9a31b8c1181fcc2122fcfc42f542f51c443ca64689", + "--runtime-env-hash=1096984665" + ], + "program": 
"/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py", + "git": { + "remote": "https://github.com/verl-project/verl", + "commit": "d9939add7a2a01923a9088891f913a5d20c4e622" + }, + "email": "shahidulshakib034@gmail.com", + "root": "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train", + "host": "gamma", + "executable": "/home/mshahidul/miniconda3/envs/verl2/bin/python3", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA A100 80GB PCIe", + "gpu_count": 6, + "disk": { + "/": { + "total": "3766429188096", + "used": "187275472896" + } + }, + "memory": { + "total": "1081814863872" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece" + } + ], + "cudaVersion": "13.0", + "writerId": "xatoc4sccrgunnxxpi6utpvennxv92j1" +} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/logs/debug-core.log 
b/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..f46a236f421901f950a2575bdb1d7fccd85633e9 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2026-02-06T20:43:52.895538172-05:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpp9ts52dp/port-2961432.txt","pid":2961432,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-06T20:43:52.896770551-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":2961432} +{"time":"2026-02-06T20:43:52.896683558-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-2961432-2973102-2322219734/socket","Net":"unix"}} +{"time":"2026-02-06T20:43:53.051587606-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-06T20:43:53.065614346-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"2so5yfiu","id":"1(@)"} +{"time":"2026-02-06T20:43:53.596919655-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"2so5yfiu","id":"1(@)"} +{"time":"2026-02-06T20:43:59.695327603-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"4w9v2kc6z0cw"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/logs/debug-internal.log b/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..c7b98fc1768edd98021cef46e0ad035d3782d1bf --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2026-02-06T20:43:53.067264047-05:00","level":"INFO","msg":"stream: starting","core version":"0.24.1"} 
+{"time":"2026-02-06T20:43:53.594021224-05:00","level":"INFO","msg":"stream: created new stream","id":"2so5yfiu"} +{"time":"2026-02-06T20:43:53.594173411-05:00","level":"INFO","msg":"handler: started","stream_id":"2so5yfiu"} +{"time":"2026-02-06T20:43:53.596884486-05:00","level":"INFO","msg":"stream: started","id":"2so5yfiu"} +{"time":"2026-02-06T20:43:53.597090375-05:00","level":"INFO","msg":"sender: started","stream_id":"2so5yfiu"} +{"time":"2026-02-06T20:43:53.597095923-05:00","level":"INFO","msg":"writer: started","stream_id":"2so5yfiu"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..6bf17bd352929357daae3100273c3dd60e617e2d --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/logs/debug.log @@ -0,0 +1,19 @@ +2026-02-06 20:43:52,756 INFO MainThread:2961432 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-06 20:43:52,756 INFO MainThread:2961432 [wandb_setup.py:_flush():81] Configure stats pid to 2961432 +2026-02-06 20:43:52,756 INFO MainThread:2961432 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-06 20:43:52,756 INFO MainThread:2961432 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/logs/debug.log +2026-02-06 20:43:52,757 INFO MainThread:2961432 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260206_204352-2so5yfiu/logs/debug-internal.log +2026-02-06 20:43:52,757 INFO MainThread:2961432 [wandb_init.py:init():844] calling init triggers +2026-02-06 20:43:52,759 INFO MainThread:2961432 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: 
{'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 32, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 
'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 
'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 1024, 'response_length': 2048, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.8, 'ignore_eos': False, 'enforce_eager': False, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': 
False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': False, 'agent': {'_target_': 'verl.workers.config.AgentLoopConfig', 'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 
'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': False}, 'model': {'_target_': 'verl.workers.config.HFModelConfig', 'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet', 'train_max_samples': -1, 'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 1024, 'max_response_length': 2048, 'train_batch_size': 512, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 
'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': None}, 'datagen': {'path': None, 'name': None}, 'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 
'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': 
{'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 'reward_loop_class_name': None, 'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 
2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 'target_kl': 0.1}, 'use_pf_ppo': False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-optimized-multiclinsum-gs', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 20, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 5, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': '/home/mshahidul/readctrl/code/RL_model/train_v2', 'max_actor_ckpt_to_keep': 1, 'max_critic_ckpt_to_keep': 1, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto', 'remove_previous_ckpt_in_save': True}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 
'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, '_wandb': {}} +2026-02-06 20:43:52,759 INFO MainThread:2961432 [wandb_init.py:init():892] starting backend +2026-02-06 20:43:53,051 INFO MainThread:2961432 [wandb_init.py:init():895] sending inform_init request +2026-02-06 20:43:53,060 INFO MainThread:2961432 [wandb_init.py:init():903] backend started and connected +2026-02-06 20:43:53,073 INFO MainThread:2961432 [wandb_init.py:init():973] updated telemetry +2026-02-06 20:43:53,097 INFO MainThread:2961432 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-06 20:43:53,881 INFO MainThread:2961432 [wandb_init.py:init():1042] starting run threads in backend +2026-02-06 20:43:54,666 INFO MainThread:2961432 [wandb_run.py:_console_start():2529] atexit reg +2026-02-06 20:43:54,667 INFO MainThread:2961432 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-06 20:43:54,667 INFO MainThread:2961432 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-06 20:43:54,667 INFO MainThread:2961432 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-06 20:43:54,679 INFO MainThread:2961432 [wandb_init.py:init():1082] run started, returning control to user process diff --git a/code/RL_model/verl/verl_train/wandb/run-20260206_205901-0ndh0r3l/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260206_205901-0ndh0r3l/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..58a9e6cece3eea606c8e64afc0e67ad5b191c015 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260206_205901-0ndh0r3l/files/output.log @@ -0,0 +1,14 @@ +wandb: Detected [openai] in use. 
+wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /home/mshahidul/readctrl/code/RL_model/train_v2/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} +validation generation end +("Initial validation metrics: {'val-aux/multiclinsum/reward/mean@1': " + "np.float64(-5.353099036216736), 'val-core/multiclinsum/acc/mean@1': " + "np.float64(-5.3530990323062895), 'val-aux/num_turns/min': np.int32(2), " + "'val-aux/num_turns/max': np.int32(2), 'val-aux/num_turns/mean': " + 'np.float64(2.0)}') +step:0 - val-aux/multiclinsum/reward/mean@1:np.float64(-5.353099036216736) - val-core/multiclinsum/acc/mean@1:np.float64(-5.3530990323062895) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) +Training Progress: 0%| | 0/90 [00:00\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-02-07T06:36:33.048102186-05:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/shahidulshakib034-khulna-university-of-engineering-techn/readctrl-verl/0ndh0r3l/file_stream\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260206_205901-0ndh0r3l/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260206_205901-0ndh0r3l/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..e2190c20c19927122c1b5bf5f7e7d6837678acdf --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260206_205901-0ndh0r3l/logs/debug.log @@ -0,0 +1,19 @@ +2026-02-06 20:59:01,529 INFO MainThread:3004444 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-06 20:59:01,529 INFO MainThread:3004444 [wandb_setup.py:_flush():81] Configure stats pid to 3004444 +2026-02-06 20:59:01,529 INFO MainThread:3004444 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-06 20:59:01,529 INFO MainThread:3004444 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260206_205901-0ndh0r3l/logs/debug.log +2026-02-06 20:59:01,530 INFO MainThread:3004444 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260206_205901-0ndh0r3l/logs/debug-internal.log +2026-02-06 20:59:01,530 INFO MainThread:3004444 [wandb_init.py:init():844] calling init triggers +2026-02-06 20:59:01,531 INFO MainThread:3004444 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 
'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 32, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 
'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 
'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 1024, 'response_length': 2048, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.6, 'ignore_eos': False, 'enforce_eager': False, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 
'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': False, 'agent': {'_target_': 'verl.workers.config.AgentLoopConfig', 'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': 
False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': False}, 'model': {'_target_': 'verl.workers.config.HFModelConfig', 'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet', 'train_max_samples': -1, 'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 1024, 'max_response_length': 2048, 'train_batch_size': 512, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 
'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': None}, 'datagen': {'path': None, 'name': None}, 'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 
'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 
'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 'reward_loop_class_name': None, 'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 
1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 'target_kl': 0.1}, 'use_pf_ppo': False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-optimized-multiclinsum-gs', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 20, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 5, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': '/home/mshahidul/readctrl/code/RL_model/train_v2', 'max_actor_ckpt_to_keep': 1, 'max_critic_ckpt_to_keep': 1, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto', 'remove_previous_ckpt_in_save': True}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 
'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, '_wandb': {}} +2026-02-06 20:59:01,531 INFO MainThread:3004444 [wandb_init.py:init():892] starting backend +2026-02-06 20:59:01,822 INFO MainThread:3004444 [wandb_init.py:init():895] sending inform_init request +2026-02-06 20:59:01,831 INFO MainThread:3004444 [wandb_init.py:init():903] backend started and connected +2026-02-06 20:59:01,844 INFO MainThread:3004444 [wandb_init.py:init():973] updated telemetry +2026-02-06 20:59:01,868 INFO MainThread:3004444 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-06 20:59:02,668 INFO MainThread:3004444 [wandb_init.py:init():1042] starting run threads in backend +2026-02-06 20:59:03,446 INFO MainThread:3004444 [wandb_run.py:_console_start():2529] atexit reg +2026-02-06 20:59:03,446 INFO MainThread:3004444 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-06 20:59:03,446 INFO MainThread:3004444 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-06 20:59:03,447 INFO MainThread:3004444 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-06 20:59:03,459 INFO MainThread:3004444 [wandb_init.py:init():1082] run started, returning control to user process diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..d78af0c289eb892ae52257239a41a200ae2d6cd8 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/files/output.log @@ -0,0 +1,6 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /home/mshahidul/readctrl/code/RL_model/train_v2/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/files/requirements.txt b/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c168a7097ef00c84a34fe1db69f95d6eaafb03f2 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/files/requirements.txt @@ -0,0 +1,269 @@ +verl==0.8.0.dev0 +psutil==7.1.3 +colorama==0.4.6 +annotated-doc==0.0.4 +sentry-sdk==2.51.0 +requests==2.32.5 +nvidia-cufile-cu12==1.13.1.3 +ml_dtypes==0.5.4 +xformers==0.0.32.post1 +sglang==0.5.2 +multidict==6.7.1 +typing_extensions==4.15.0 +nvidia-cusparselt-cu12==0.7.1 +openai-harmony==0.0.4 +transformers==4.56.1 +Werkzeug==3.1.5 +identify==2.6.16 +pytest==9.0.2 +nvidia-cuda-runtime-cu12==12.8.90 +GitPython==3.1.46 +cupy-cuda12x==13.6.0 +tokenizers==0.22.2 +pybind11==3.0.1 +google-api-core==2.29.0 +partial-json-parser==0.2.1.1.post7 +aiohttp-cors==0.8.1 +sniffio==1.3.1 +tensordict==0.10.0 +smart_open==7.5.0 +cffi==2.0.0 +wcwidth==0.5.3 +asttokens==3.0.1 +opencensus==0.11.4 +rpds-py==0.30.0 +py-spy==0.4.1 +gguf==0.17.1 +nvidia-nvjitlink-cu12==12.8.93 +httpx==0.28.1 +cuda-python==13.1.1 +annotated-types==0.7.0 +regex==2026.1.15 +vllm==0.11.0 +idna==3.11 +fsspec==2025.10.0 +parso==0.8.5 +pydantic-extra-types==2.11.0 +MarkupSafe==3.0.3 +cryptography==46.0.4 +filelock==3.20.3 +modelscope==1.34.0 +outlines==0.1.11 +dnspython==2.8.0 +scipy==1.17.0 +zipp==3.23.0 +PyYAML==6.0.3 +onnx==1.20.1 +torchdata==0.11.0 +cuda-pathfinder==1.3.3 +verl==0.8.0.dev0 +httptools==0.7.1 
+opencv-python-headless==4.13.0.90 +importlib_metadata==8.7.1 +peft==0.18.1 +opentelemetry-sdk==1.39.1 +python-json-logger==4.0.0 +cuda-bindings==13.1.1 +mdurl==0.1.2 +referencing==0.37.0 +xxhash==3.6.0 +interegular==0.3.3 +fastapi-cli==0.0.20 +uv==0.9.28 +tensorboard==2.20.0 +nvidia-cublas-cu12==12.8.4.1 +sentencepiece==0.2.1 +rich-toolkit==0.18.1 +numpy==2.2.0 +yarl==1.22.0 +opencv-fixer==0.2.5 +tqdm==4.67.2 +python-dotenv==1.2.1 +timm==1.0.16 +aiohappyeyeballs==2.6.1 +decord==0.6.0 +jiter==0.12.0 +airportsdata==20250909 +markdown-it-py==4.0.0 +nvidia-cusolver-cu12==11.7.3.90 +pyarrow==23.0.0 +opentelemetry-proto==1.39.1 +anyio==4.12.1 +pycryptodomex==3.23.0 +prometheus_client==0.24.1 +aiohttp==3.13.3 +urllib3==2.6.3 +pexpect==4.9.0 +pydantic-settings==2.12.0 +distro==1.9.0 +av==16.1.0 +cloudpickle==3.1.2 +mpmath==1.3.0 +certifi==2026.1.4 +antlr4-python3-runtime==4.9.3 +torchvision==0.23.0 +accelerate==1.12.0 +watchfiles==1.1.1 +ruff==0.14.14 +wheel==0.46.3 +omegaconf==2.3.0 +nvidia-cufft-cu12==11.3.3.83 +multiprocess==0.70.18 +frozendict==2.4.7 +sympy==1.14.0 +setproctitle==1.3.7 +setuptools==79.0.1 +py-cpuinfo==9.0.0 +ipython_pygments_lexers==1.1.1 +rich==14.3.2 +uvicorn==0.40.0 +openai==1.99.1 +outlines_core==0.2.11 +llvmlite==0.44.0 +nvidia-cuda-cupti-cu12==12.8.90 +attrs==25.4.0 +anthropic==0.77.0 +packaging==25.0 +fastrlock==0.8.3 +astor==0.8.1 +pluggy==1.6.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +psutil==7.2.2 +virtualenv==20.36.1 +cbor2==5.8.0 +compressed-tensors==0.11.0 +nvidia-cusparse-cu12==12.5.8.93 +networkx==3.6.1 +httpcore==1.0.9 +onnxscript==0.3.1 +smmap==5.0.2 +opencv-python==4.13.0.90 +traitlets==5.14.3 +python-multipart==0.0.22 +pyvers==0.1.0 +huggingface-hub==0.36.0 +pillow==12.1.0 +jsonschema==4.26.0 +cfgv==3.5.0 +optree==0.18.0 +email-validator==2.3.0 +tabulate==0.9.0 +pre_commit==4.5.1 +msgpack==1.1.2 +depyf==0.19.0 +numba==0.61.2 +six==1.17.0 +aiosignal==1.4.0 +nvidia-nvtx-cu12==12.8.90 +propcache==0.4.1 +torch_memory_saver==0.0.8 +h11==0.16.0 
+frozenlist==1.8.0 +websockets==16.0 +nvidia-cudnn-frontend==1.18.0 +build==1.4.0 +google-auth==2.48.0 +pycountry==24.6.1 +stack-data==0.6.3 +typing-inspection==0.4.2 +googleapis-common-protos==1.72.0 +pandas==3.0.0 +typer==0.21.1 +protobuf==6.33.5 +fastapi==0.128.0 +blake3==1.0.8 +opentelemetry-semantic-conventions==0.60b1 +opentelemetry-exporter-prometheus==0.60b1 +nvidia-cudnn-cu12==9.10.2.21 +Markdown==3.10.1 +liger_kernel==0.6.4 +nodeenv==1.10.0 +prompt_toolkit==3.0.52 +torchaudio==2.8.0 +codetiming==1.4.0 +platformdirs==4.5.1 +jsonschema-specifications==2025.9.1 +hydra-core==1.3.2 +tensorboard-data-server==0.7.2 +lm-format-enforcer==0.11.3 +pyasn1_modules==0.4.2 +tiktoken==0.12.0 +starlette==0.50.0 +pyproject_hooks==1.2.0 +flash_attn==2.8.1 +rsa==4.9.1 +ray==2.53.0 +nest-asyncio==1.6.0 +lark==1.2.2 +fastar==0.8.0 +orjson==3.11.6 +prometheus-fastapi-instrumentator==7.1.0 +opentelemetry-api==1.39.1 +mathruler==0.1.0 +pydantic_core==2.41.5 +fastapi-cloud-cli==0.11.0 +pynvml==13.0.1 +loguru==0.7.3 +torch==2.8.0 +msgspec==0.20.0 +nvidia-curand-cu12==10.3.9.90 +blobfile==3.0.0 +gitdb==4.0.12 +llguidance==0.7.30 +hf_transfer==0.1.9 +nvidia-nccl-cu12==2.27.3 +qwen-vl-utils==0.0.14 +ptyprocess==0.7.0 +ipdb==0.13.13 +opencensus-context==0.1.3 +jedi==0.19.2 +click==8.3.1 +datasets==4.5.0 +soxr==1.0.0 +sgl-kernel==0.3.9.post2 +colorful==0.5.8 +pyasn1==0.6.2 +charset-normalizer==3.4.4 +nvidia-ml-py==13.590.48 +hf-xet==1.2.0 +dill==0.4.0 +absl-py==2.4.0 +pydantic==2.12.5 +wrapt==2.1.0 +flashinfer-python==0.3.1 +python-dateutil==2.9.0.post0 +torchao==0.9.0 +cachetools==7.0.0 +soundfile==0.13.1 +diskcache==5.6.3 +onnx-ir==0.1.15 +docstring_parser==0.17.0 +matplotlib-inline==0.2.1 +Pygments==2.19.2 +wandb==0.24.1 +pure_eval==0.2.3 +ninja==1.13.0 +proto-plus==1.27.0 +pyzmq==27.1.0 +iniconfig==2.3.0 +Jinja2==3.1.6 +megatron-core==0.13.1 +uvloop==0.22.1 +pycparser==3.0 +pylatexenc==2.10 +decorator==5.2.1 +shellingham==1.5.4 +lxml==6.0.2 +safetensors==0.7.0 +xgrammar==0.1.25 
+pybase64==1.4.3 +ipython==9.9.0 +mistral_common==1.9.0 +rignore==0.7.6 +einops==0.8.2 +distlib==0.4.0 +triton==3.4.0 +executing==2.2.1 +grpcio==1.76.0 +pip==25.3 +verl==0.8.0.dev0 diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/files/wandb-metadata.json b/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..ace3e2de1a2b39f86fcd6f09e9dd1b9e3a5c1075 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/files/wandb-metadata.json @@ -0,0 +1,93 @@ +{ + "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.12", + "startedAt": "2026-02-07T15:34:50.097536Z", + "args": [ + "--node-ip-address=172.16.34.29", + "--node-manager-port=36923", + "--object-store-name=/tmp/ray/session_2026-02-07_10-27-49_585748_201487/sockets/plasma_store", + "--raylet-name=/tmp/ray/session_2026-02-07_10-27-49_585748_201487/sockets/raylet", + "--redis-address=None", + "--metrics-agent-port=62681", + "--logging-rotate-bytes=536870912", + "--logging-rotate-backup-count=5", + "--runtime-env-agent-port=64918", + "--gcs-address=172.16.34.29:55671", + "--session-name=session_2026-02-07_10-27-49_585748_201487", + "--temp-dir=/tmp/ray", + "--webui=127.0.0.1:8301", + "--cluster-id=841daf3bf517d42599bf5437920e93f71bb9f6bdbebef8d87f3e3fb8", + "--startup-token=128", + "--worker-launch-time-ms=1770478087768", + "--node-id=9443de04e0e7b889515a68e44b6547fea2664df6607ccafe65b49f74", + "--runtime-env-hash=1096984665" + ], + "program": "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py", + "git": { + "remote": "https://github.com/verl-project/verl", + "commit": "d9939add7a2a01923a9088891f913a5d20c4e622" + }, + "email": "shahidulshakib034@gmail.com", + "root": "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train", + "host": "gamma", + "executable": 
"/home/mshahidul/miniconda3/envs/verl2/bin/python3", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA A100 80GB PCIe", + "gpu_count": 6, + "disk": { + "/": { + "total": "3766429188096", + "used": "188147589120" + } + }, + "memory": { + "total": "1081814863872" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece" + } + ], + "cudaVersion": "13.0", + "writerId": "af0nv9x3dpfekkhrsmbujt15hq59tdje" +} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/logs/debug-core.log b/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..a6d4e545eea1e63e4754f59c0e545922cabf10e2 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2026-02-07T10:34:50.270562852-05:00","level":"INFO","msg":"main: starting 
server","port-filename":"/tmp/tmpv2ofcc1o/port-234834.txt","pid":234834,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-07T10:34:50.272055203-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":234834} +{"time":"2026-02-07T10:34:50.272117988-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-234834-250217-273148444/socket","Net":"unix"}} +{"time":"2026-02-07T10:34:50.41166765-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-07T10:34:50.429036847-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"gjiqvndf","id":"1(@)"} +{"time":"2026-02-07T10:34:51.354718574-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"gjiqvndf","id":"1(@)"} +{"time":"2026-02-07T10:34:57.371677054-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"u3iy5baa6jjx"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/logs/debug-internal.log b/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..e1696c748f3714c69ee061d88576e8b1eccc5ab5 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2026-02-07T10:34:50.430956456-05:00","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-07T10:34:51.352286383-05:00","level":"INFO","msg":"stream: created new stream","id":"gjiqvndf"} +{"time":"2026-02-07T10:34:51.352434377-05:00","level":"INFO","msg":"handler: started","stream_id":"gjiqvndf"} +{"time":"2026-02-07T10:34:51.354691926-05:00","level":"INFO","msg":"stream: started","id":"gjiqvndf"} +{"time":"2026-02-07T10:34:51.354727628-05:00","level":"INFO","msg":"writer: 
started","stream_id":"gjiqvndf"} +{"time":"2026-02-07T10:34:51.354740753-05:00","level":"INFO","msg":"sender: started","stream_id":"gjiqvndf"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..2adb1eeb51ead5d5627bb06ed6559d9acf7a852c --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/logs/debug.log @@ -0,0 +1,19 @@ +2026-02-07 10:34:50,120 INFO MainThread:234834 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-07 10:34:50,120 INFO MainThread:234834 [wandb_setup.py:_flush():81] Configure stats pid to 234834 +2026-02-07 10:34:50,120 INFO MainThread:234834 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-07 10:34:50,121 INFO MainThread:234834 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/logs/debug.log +2026-02-07 10:34:50,121 INFO MainThread:234834 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/logs/debug-internal.log +2026-02-07 10:34:50,121 INFO MainThread:234834 [wandb_init.py:init():844] calling init triggers +2026-02-07 10:34:50,123 INFO MainThread:234834 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 
'override_optimizer_config': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 32, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 
'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': 
False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 1024, 'response_length': 2048, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.6, 'ignore_eos': False, 'enforce_eager': False, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 
'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': False, 'agent': {'_target_': 'verl.workers.config.AgentLoopConfig', 'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 
'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': False}, 'model': {'_target_': 'verl.workers.config.HFModelConfig', 'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet', 'train_max_samples': -1, 'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 1024, 'max_response_length': 2048, 'train_batch_size': 512, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': 
None, 'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': None}, 'datagen': {'path': None, 'name': None}, 'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 256, 
'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 
'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 'reward_loop_class_name': None, 'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 
'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 'target_kl': 0.1}, 'use_pf_ppo': False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-optimized-multiclinsum-gs', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 20, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 5, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': '/home/mshahidul/readctrl/code/RL_model/train_v2', 'max_actor_ckpt_to_keep': 1, 'max_critic_ckpt_to_keep': 1, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto', 'remove_previous_ckpt_in_save': True}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, 
'_wandb': {}} +2026-02-07 10:34:50,123 INFO MainThread:234834 [wandb_init.py:init():892] starting backend +2026-02-07 10:34:50,412 INFO MainThread:234834 [wandb_init.py:init():895] sending inform_init request +2026-02-07 10:34:50,421 INFO MainThread:234834 [wandb_init.py:init():903] backend started and connected +2026-02-07 10:34:50,436 INFO MainThread:234834 [wandb_init.py:init():973] updated telemetry +2026-02-07 10:34:50,460 INFO MainThread:234834 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-07 10:34:51,583 INFO MainThread:234834 [wandb_init.py:init():1042] starting run threads in backend +2026-02-07 10:34:52,343 INFO MainThread:234834 [wandb_run.py:_console_start():2529] atexit reg +2026-02-07 10:34:52,343 INFO MainThread:234834 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-07 10:34:52,343 INFO MainThread:234834 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-07 10:34:52,343 INFO MainThread:234834 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-07 10:34:52,356 INFO MainThread:234834 [wandb_init.py:init():1082] run started, returning control to user process diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..d78af0c289eb892ae52257239a41a200ae2d6cd8 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/files/output.log @@ -0,0 +1,6 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /home/mshahidul/readctrl/code/RL_model/train_v2/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/files/requirements.txt b/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c168a7097ef00c84a34fe1db69f95d6eaafb03f2 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/files/requirements.txt @@ -0,0 +1,269 @@ +verl==0.8.0.dev0 +psutil==7.1.3 +colorama==0.4.6 +annotated-doc==0.0.4 +sentry-sdk==2.51.0 +requests==2.32.5 +nvidia-cufile-cu12==1.13.1.3 +ml_dtypes==0.5.4 +xformers==0.0.32.post1 +sglang==0.5.2 +multidict==6.7.1 +typing_extensions==4.15.0 +nvidia-cusparselt-cu12==0.7.1 +openai-harmony==0.0.4 +transformers==4.56.1 +Werkzeug==3.1.5 +identify==2.6.16 +pytest==9.0.2 +nvidia-cuda-runtime-cu12==12.8.90 +GitPython==3.1.46 +cupy-cuda12x==13.6.0 +tokenizers==0.22.2 +pybind11==3.0.1 +google-api-core==2.29.0 +partial-json-parser==0.2.1.1.post7 +aiohttp-cors==0.8.1 +sniffio==1.3.1 +tensordict==0.10.0 +smart_open==7.5.0 +cffi==2.0.0 +wcwidth==0.5.3 +asttokens==3.0.1 +opencensus==0.11.4 +rpds-py==0.30.0 +py-spy==0.4.1 +gguf==0.17.1 +nvidia-nvjitlink-cu12==12.8.93 +httpx==0.28.1 +cuda-python==13.1.1 +annotated-types==0.7.0 +regex==2026.1.15 +vllm==0.11.0 +idna==3.11 +fsspec==2025.10.0 +parso==0.8.5 +pydantic-extra-types==2.11.0 +MarkupSafe==3.0.3 +cryptography==46.0.4 +filelock==3.20.3 +modelscope==1.34.0 +outlines==0.1.11 +dnspython==2.8.0 +scipy==1.17.0 +zipp==3.23.0 +PyYAML==6.0.3 +onnx==1.20.1 +torchdata==0.11.0 +cuda-pathfinder==1.3.3 +verl==0.8.0.dev0 +httptools==0.7.1 
+opencv-python-headless==4.13.0.90 +importlib_metadata==8.7.1 +peft==0.18.1 +opentelemetry-sdk==1.39.1 +python-json-logger==4.0.0 +cuda-bindings==13.1.1 +mdurl==0.1.2 +referencing==0.37.0 +xxhash==3.6.0 +interegular==0.3.3 +fastapi-cli==0.0.20 +uv==0.9.28 +tensorboard==2.20.0 +nvidia-cublas-cu12==12.8.4.1 +sentencepiece==0.2.1 +rich-toolkit==0.18.1 +numpy==2.2.0 +yarl==1.22.0 +opencv-fixer==0.2.5 +tqdm==4.67.2 +python-dotenv==1.2.1 +timm==1.0.16 +aiohappyeyeballs==2.6.1 +decord==0.6.0 +jiter==0.12.0 +airportsdata==20250909 +markdown-it-py==4.0.0 +nvidia-cusolver-cu12==11.7.3.90 +pyarrow==23.0.0 +opentelemetry-proto==1.39.1 +anyio==4.12.1 +pycryptodomex==3.23.0 +prometheus_client==0.24.1 +aiohttp==3.13.3 +urllib3==2.6.3 +pexpect==4.9.0 +pydantic-settings==2.12.0 +distro==1.9.0 +av==16.1.0 +cloudpickle==3.1.2 +mpmath==1.3.0 +certifi==2026.1.4 +antlr4-python3-runtime==4.9.3 +torchvision==0.23.0 +accelerate==1.12.0 +watchfiles==1.1.1 +ruff==0.14.14 +wheel==0.46.3 +omegaconf==2.3.0 +nvidia-cufft-cu12==11.3.3.83 +multiprocess==0.70.18 +frozendict==2.4.7 +sympy==1.14.0 +setproctitle==1.3.7 +setuptools==79.0.1 +py-cpuinfo==9.0.0 +ipython_pygments_lexers==1.1.1 +rich==14.3.2 +uvicorn==0.40.0 +openai==1.99.1 +outlines_core==0.2.11 +llvmlite==0.44.0 +nvidia-cuda-cupti-cu12==12.8.90 +attrs==25.4.0 +anthropic==0.77.0 +packaging==25.0 +fastrlock==0.8.3 +astor==0.8.1 +pluggy==1.6.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +psutil==7.2.2 +virtualenv==20.36.1 +cbor2==5.8.0 +compressed-tensors==0.11.0 +nvidia-cusparse-cu12==12.5.8.93 +networkx==3.6.1 +httpcore==1.0.9 +onnxscript==0.3.1 +smmap==5.0.2 +opencv-python==4.13.0.90 +traitlets==5.14.3 +python-multipart==0.0.22 +pyvers==0.1.0 +huggingface-hub==0.36.0 +pillow==12.1.0 +jsonschema==4.26.0 +cfgv==3.5.0 +optree==0.18.0 +email-validator==2.3.0 +tabulate==0.9.0 +pre_commit==4.5.1 +msgpack==1.1.2 +depyf==0.19.0 +numba==0.61.2 +six==1.17.0 +aiosignal==1.4.0 +nvidia-nvtx-cu12==12.8.90 +propcache==0.4.1 +torch_memory_saver==0.0.8 +h11==0.16.0 
+frozenlist==1.8.0 +websockets==16.0 +nvidia-cudnn-frontend==1.18.0 +build==1.4.0 +google-auth==2.48.0 +pycountry==24.6.1 +stack-data==0.6.3 +typing-inspection==0.4.2 +googleapis-common-protos==1.72.0 +pandas==3.0.0 +typer==0.21.1 +protobuf==6.33.5 +fastapi==0.128.0 +blake3==1.0.8 +opentelemetry-semantic-conventions==0.60b1 +opentelemetry-exporter-prometheus==0.60b1 +nvidia-cudnn-cu12==9.10.2.21 +Markdown==3.10.1 +liger_kernel==0.6.4 +nodeenv==1.10.0 +prompt_toolkit==3.0.52 +torchaudio==2.8.0 +codetiming==1.4.0 +platformdirs==4.5.1 +jsonschema-specifications==2025.9.1 +hydra-core==1.3.2 +tensorboard-data-server==0.7.2 +lm-format-enforcer==0.11.3 +pyasn1_modules==0.4.2 +tiktoken==0.12.0 +starlette==0.50.0 +pyproject_hooks==1.2.0 +flash_attn==2.8.1 +rsa==4.9.1 +ray==2.53.0 +nest-asyncio==1.6.0 +lark==1.2.2 +fastar==0.8.0 +orjson==3.11.6 +prometheus-fastapi-instrumentator==7.1.0 +opentelemetry-api==1.39.1 +mathruler==0.1.0 +pydantic_core==2.41.5 +fastapi-cloud-cli==0.11.0 +pynvml==13.0.1 +loguru==0.7.3 +torch==2.8.0 +msgspec==0.20.0 +nvidia-curand-cu12==10.3.9.90 +blobfile==3.0.0 +gitdb==4.0.12 +llguidance==0.7.30 +hf_transfer==0.1.9 +nvidia-nccl-cu12==2.27.3 +qwen-vl-utils==0.0.14 +ptyprocess==0.7.0 +ipdb==0.13.13 +opencensus-context==0.1.3 +jedi==0.19.2 +click==8.3.1 +datasets==4.5.0 +soxr==1.0.0 +sgl-kernel==0.3.9.post2 +colorful==0.5.8 +pyasn1==0.6.2 +charset-normalizer==3.4.4 +nvidia-ml-py==13.590.48 +hf-xet==1.2.0 +dill==0.4.0 +absl-py==2.4.0 +pydantic==2.12.5 +wrapt==2.1.0 +flashinfer-python==0.3.1 +python-dateutil==2.9.0.post0 +torchao==0.9.0 +cachetools==7.0.0 +soundfile==0.13.1 +diskcache==5.6.3 +onnx-ir==0.1.15 +docstring_parser==0.17.0 +matplotlib-inline==0.2.1 +Pygments==2.19.2 +wandb==0.24.1 +pure_eval==0.2.3 +ninja==1.13.0 +proto-plus==1.27.0 +pyzmq==27.1.0 +iniconfig==2.3.0 +Jinja2==3.1.6 +megatron-core==0.13.1 +uvloop==0.22.1 +pycparser==3.0 +pylatexenc==2.10 +decorator==5.2.1 +shellingham==1.5.4 +lxml==6.0.2 +safetensors==0.7.0 +xgrammar==0.1.25 
+pybase64==1.4.3 +ipython==9.9.0 +mistral_common==1.9.0 +rignore==0.7.6 +einops==0.8.2 +distlib==0.4.0 +triton==3.4.0 +executing==2.2.1 +grpcio==1.76.0 +pip==25.3 +verl==0.8.0.dev0 diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/files/wandb-metadata.json b/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4d684cb4e0fd3db8541a1713504acacaaca9bf44 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/files/wandb-metadata.json @@ -0,0 +1,93 @@ +{ + "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.12", + "startedAt": "2026-02-07T16:00:43.122673Z", + "args": [ + "--node-ip-address=172.16.34.29", + "--node-manager-port=43297", + "--object-store-name=/tmp/ray/session_2026-02-07_10-53-44_882903_374056/sockets/plasma_store", + "--raylet-name=/tmp/ray/session_2026-02-07_10-53-44_882903_374056/sockets/raylet", + "--redis-address=None", + "--metrics-agent-port=53255", + "--logging-rotate-bytes=536870912", + "--logging-rotate-backup-count=5", + "--runtime-env-agent-port=64878", + "--gcs-address=172.16.34.29:65048", + "--session-name=session_2026-02-07_10-53-44_882903_374056", + "--temp-dir=/tmp/ray", + "--webui=127.0.0.1:8297", + "--cluster-id=45af7cb19e83f65af29f825c76037f19f111e73b255f6821957a9742", + "--startup-token=128", + "--worker-launch-time-ms=1770479641843", + "--node-id=8f30099b7dfadcd6cb48455a01a58a59d61a74566e43759f39d48ec5", + "--runtime-env-hash=1096984665" + ], + "program": "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py", + "git": { + "remote": "https://github.com/verl-project/verl", + "commit": "d9939add7a2a01923a9088891f913a5d20c4e622" + }, + "email": "shahidulshakib034@gmail.com", + "root": "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train", + "host": "gamma", + "executable": 
"/home/mshahidul/miniconda3/envs/verl2/bin/python3", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA A100 80GB PCIe", + "gpu_count": 6, + "disk": { + "/": { + "total": "3766429188096", + "used": "188198088704" + } + }, + "memory": { + "total": "1081814863872" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece" + } + ], + "cudaVersion": "13.0", + "writerId": "4bco19eqiqqcgqtfc6fjxcmnw9htd5ev" +} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/files/wandb-summary.json b/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..df418e460a991959868709428088ad4948a7a21d --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":258,"_wandb":{"runtime":258}} \ No newline at end of file diff --git 
a/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/logs/debug-core.log b/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..7c56c995ffe0a38e0c3cc914bd9097e1012fc2d7 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/logs/debug-core.log @@ -0,0 +1,8 @@ +{"time":"2026-02-07T11:00:43.279763084-05:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpjnpsrfuy/port-391957.txt","pid":391957,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-07T11:00:43.282615617-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":391957} +{"time":"2026-02-07T11:00:43.282602902-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-391957-425242-629764596/socket","Net":"unix"}} +{"time":"2026-02-07T11:00:43.439644719-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-07T11:00:43.454683954-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"guu1a2ui","id":"1(@)"} +{"time":"2026-02-07T11:00:45.188214759-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"guu1a2ui","id":"1(@)"} +{"time":"2026-02-07T11:00:51.484496051-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"awrfwfzlbciu"} +{"time":"2026-02-07T11:05:04.093068084-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"awrfwfzlbciu"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/logs/debug-internal.log b/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d73333ee1a016e8fa2ca38b7cefd33eddb68da67 --- /dev/null +++ 
b/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2026-02-07T11:00:43.455983119-05:00","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-07T11:00:45.185665869-05:00","level":"INFO","msg":"stream: created new stream","id":"guu1a2ui"} +{"time":"2026-02-07T11:00:45.185857972-05:00","level":"INFO","msg":"handler: started","stream_id":"guu1a2ui"} +{"time":"2026-02-07T11:00:45.188178423-05:00","level":"INFO","msg":"stream: started","id":"guu1a2ui"} +{"time":"2026-02-07T11:00:45.18824121-05:00","level":"INFO","msg":"sender: started","stream_id":"guu1a2ui"} +{"time":"2026-02-07T11:00:45.188252559-05:00","level":"INFO","msg":"writer: started","stream_id":"guu1a2ui"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..06d4dd9903f7dd6d2833118cd6da669556f0aea0 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/logs/debug.log @@ -0,0 +1,23 @@ +2026-02-07 11:00:43,143 INFO MainThread:391957 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-07 11:00:43,144 INFO MainThread:391957 [wandb_setup.py:_flush():81] Configure stats pid to 391957 +2026-02-07 11:00:43,144 INFO MainThread:391957 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-07 11:00:43,144 INFO MainThread:391957 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/logs/debug.log +2026-02-07 11:00:43,144 INFO MainThread:391957 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260207_110043-guu1a2ui/logs/debug-internal.log +2026-02-07 
11:00:43,144 INFO MainThread:391957 [wandb_init.py:init():844] calling init triggers +2026-02-07 11:00:43,146 INFO MainThread:391957 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 32, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 
0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 
'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 1024, 'response_length': 2048, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.6, 'ignore_eos': False, 'enforce_eager': False, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 
0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': False, 'agent': {'_target_': 'verl.workers.config.AgentLoopConfig', 'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 
'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': False}, 'model': {'_target_': 'verl.workers.config.HFModelConfig', 'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet', 'train_max_samples': -1, 
'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 1024, 'max_response_length': 2048, 'train_batch_size': 512, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': None}, 'datagen': {'path': None, 'name': None}, 'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 
'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 
'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 'reward_loop_class_name': None, 'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 
'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 'target_kl': 0.1}, 'use_pf_ppo': False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-optimized-multiclinsum-gs', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 20, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 5, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': '/home/mshahidul/readctrl/code/RL_model/train_v2', 'max_actor_ckpt_to_keep': 1, 'max_critic_ckpt_to_keep': 1, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto', 'remove_previous_ckpt_in_save': True}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 
'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, '_wandb': {}} +2026-02-07 11:00:43,146 INFO MainThread:391957 [wandb_init.py:init():892] starting backend +2026-02-07 11:00:43,440 INFO MainThread:391957 [wandb_init.py:init():895] sending inform_init request +2026-02-07 11:00:43,449 INFO MainThread:391957 [wandb_init.py:init():903] backend started and connected +2026-02-07 11:00:43,461 INFO MainThread:391957 [wandb_init.py:init():973] updated telemetry +2026-02-07 11:00:43,485 INFO MainThread:391957 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-07 11:00:45,512 INFO MainThread:391957 [wandb_init.py:init():1042] starting run threads in backend +2026-02-07 11:00:46,454 INFO MainThread:391957 [wandb_run.py:_console_start():2529] atexit reg +2026-02-07 11:00:46,455 INFO MainThread:391957 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-07 11:00:46,455 INFO MainThread:391957 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-07 11:00:46,456 INFO MainThread:391957 [wandb_run.py:_redirect():2469] Redirects installed. 
+2026-02-07 11:00:46,470 INFO MainThread:391957 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-07 11:05:04,088 INFO MainThread:391957 [wandb_run.py:_finish():2295] finishing run shahidulshakib034-khulna-university-of-engineering-techn/readctrl-verl/guu1a2ui +2026-02-07 11:05:04,090 INFO MainThread:391957 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0 +2026-02-07 11:05:04,091 INFO MainThread:391957 [wandb_run.py:_restore():2476] restore +2026-02-07 11:05:04,092 INFO MainThread:391957 [wandb_run.py:_restore():2482] restore done diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..d78af0c289eb892ae52257239a41a200ae2d6cd8 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/files/output.log @@ -0,0 +1,6 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /home/mshahidul/readctrl/code/RL_model/train_v2/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/files/requirements.txt b/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c168a7097ef00c84a34fe1db69f95d6eaafb03f2 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/files/requirements.txt @@ -0,0 +1,269 @@ +verl==0.8.0.dev0 +psutil==7.1.3 +colorama==0.4.6 +annotated-doc==0.0.4 +sentry-sdk==2.51.0 +requests==2.32.5 +nvidia-cufile-cu12==1.13.1.3 +ml_dtypes==0.5.4 +xformers==0.0.32.post1 +sglang==0.5.2 +multidict==6.7.1 +typing_extensions==4.15.0 +nvidia-cusparselt-cu12==0.7.1 +openai-harmony==0.0.4 +transformers==4.56.1 +Werkzeug==3.1.5 +identify==2.6.16 +pytest==9.0.2 +nvidia-cuda-runtime-cu12==12.8.90 +GitPython==3.1.46 +cupy-cuda12x==13.6.0 +tokenizers==0.22.2 +pybind11==3.0.1 +google-api-core==2.29.0 +partial-json-parser==0.2.1.1.post7 +aiohttp-cors==0.8.1 +sniffio==1.3.1 +tensordict==0.10.0 +smart_open==7.5.0 +cffi==2.0.0 +wcwidth==0.5.3 +asttokens==3.0.1 +opencensus==0.11.4 +rpds-py==0.30.0 +py-spy==0.4.1 +gguf==0.17.1 +nvidia-nvjitlink-cu12==12.8.93 +httpx==0.28.1 +cuda-python==13.1.1 +annotated-types==0.7.0 +regex==2026.1.15 +vllm==0.11.0 +idna==3.11 +fsspec==2025.10.0 +parso==0.8.5 +pydantic-extra-types==2.11.0 +MarkupSafe==3.0.3 +cryptography==46.0.4 +filelock==3.20.3 +modelscope==1.34.0 +outlines==0.1.11 +dnspython==2.8.0 +scipy==1.17.0 +zipp==3.23.0 +PyYAML==6.0.3 +onnx==1.20.1 +torchdata==0.11.0 +cuda-pathfinder==1.3.3 +verl==0.8.0.dev0 +httptools==0.7.1 
+opencv-python-headless==4.13.0.90 +importlib_metadata==8.7.1 +peft==0.18.1 +opentelemetry-sdk==1.39.1 +python-json-logger==4.0.0 +cuda-bindings==13.1.1 +mdurl==0.1.2 +referencing==0.37.0 +xxhash==3.6.0 +interegular==0.3.3 +fastapi-cli==0.0.20 +uv==0.9.28 +tensorboard==2.20.0 +nvidia-cublas-cu12==12.8.4.1 +sentencepiece==0.2.1 +rich-toolkit==0.18.1 +numpy==2.2.0 +yarl==1.22.0 +opencv-fixer==0.2.5 +tqdm==4.67.2 +python-dotenv==1.2.1 +timm==1.0.16 +aiohappyeyeballs==2.6.1 +decord==0.6.0 +jiter==0.12.0 +airportsdata==20250909 +markdown-it-py==4.0.0 +nvidia-cusolver-cu12==11.7.3.90 +pyarrow==23.0.0 +opentelemetry-proto==1.39.1 +anyio==4.12.1 +pycryptodomex==3.23.0 +prometheus_client==0.24.1 +aiohttp==3.13.3 +urllib3==2.6.3 +pexpect==4.9.0 +pydantic-settings==2.12.0 +distro==1.9.0 +av==16.1.0 +cloudpickle==3.1.2 +mpmath==1.3.0 +certifi==2026.1.4 +antlr4-python3-runtime==4.9.3 +torchvision==0.23.0 +accelerate==1.12.0 +watchfiles==1.1.1 +ruff==0.14.14 +wheel==0.46.3 +omegaconf==2.3.0 +nvidia-cufft-cu12==11.3.3.83 +multiprocess==0.70.18 +frozendict==2.4.7 +sympy==1.14.0 +setproctitle==1.3.7 +setuptools==79.0.1 +py-cpuinfo==9.0.0 +ipython_pygments_lexers==1.1.1 +rich==14.3.2 +uvicorn==0.40.0 +openai==1.99.1 +outlines_core==0.2.11 +llvmlite==0.44.0 +nvidia-cuda-cupti-cu12==12.8.90 +attrs==25.4.0 +anthropic==0.77.0 +packaging==25.0 +fastrlock==0.8.3 +astor==0.8.1 +pluggy==1.6.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +psutil==7.2.2 +virtualenv==20.36.1 +cbor2==5.8.0 +compressed-tensors==0.11.0 +nvidia-cusparse-cu12==12.5.8.93 +networkx==3.6.1 +httpcore==1.0.9 +onnxscript==0.3.1 +smmap==5.0.2 +opencv-python==4.13.0.90 +traitlets==5.14.3 +python-multipart==0.0.22 +pyvers==0.1.0 +huggingface-hub==0.36.0 +pillow==12.1.0 +jsonschema==4.26.0 +cfgv==3.5.0 +optree==0.18.0 +email-validator==2.3.0 +tabulate==0.9.0 +pre_commit==4.5.1 +msgpack==1.1.2 +depyf==0.19.0 +numba==0.61.2 +six==1.17.0 +aiosignal==1.4.0 +nvidia-nvtx-cu12==12.8.90 +propcache==0.4.1 +torch_memory_saver==0.0.8 +h11==0.16.0 
+frozenlist==1.8.0 +websockets==16.0 +nvidia-cudnn-frontend==1.18.0 +build==1.4.0 +google-auth==2.48.0 +pycountry==24.6.1 +stack-data==0.6.3 +typing-inspection==0.4.2 +googleapis-common-protos==1.72.0 +pandas==3.0.0 +typer==0.21.1 +protobuf==6.33.5 +fastapi==0.128.0 +blake3==1.0.8 +opentelemetry-semantic-conventions==0.60b1 +opentelemetry-exporter-prometheus==0.60b1 +nvidia-cudnn-cu12==9.10.2.21 +Markdown==3.10.1 +liger_kernel==0.6.4 +nodeenv==1.10.0 +prompt_toolkit==3.0.52 +torchaudio==2.8.0 +codetiming==1.4.0 +platformdirs==4.5.1 +jsonschema-specifications==2025.9.1 +hydra-core==1.3.2 +tensorboard-data-server==0.7.2 +lm-format-enforcer==0.11.3 +pyasn1_modules==0.4.2 +tiktoken==0.12.0 +starlette==0.50.0 +pyproject_hooks==1.2.0 +flash_attn==2.8.1 +rsa==4.9.1 +ray==2.53.0 +nest-asyncio==1.6.0 +lark==1.2.2 +fastar==0.8.0 +orjson==3.11.6 +prometheus-fastapi-instrumentator==7.1.0 +opentelemetry-api==1.39.1 +mathruler==0.1.0 +pydantic_core==2.41.5 +fastapi-cloud-cli==0.11.0 +pynvml==13.0.1 +loguru==0.7.3 +torch==2.8.0 +msgspec==0.20.0 +nvidia-curand-cu12==10.3.9.90 +blobfile==3.0.0 +gitdb==4.0.12 +llguidance==0.7.30 +hf_transfer==0.1.9 +nvidia-nccl-cu12==2.27.3 +qwen-vl-utils==0.0.14 +ptyprocess==0.7.0 +ipdb==0.13.13 +opencensus-context==0.1.3 +jedi==0.19.2 +click==8.3.1 +datasets==4.5.0 +soxr==1.0.0 +sgl-kernel==0.3.9.post2 +colorful==0.5.8 +pyasn1==0.6.2 +charset-normalizer==3.4.4 +nvidia-ml-py==13.590.48 +hf-xet==1.2.0 +dill==0.4.0 +absl-py==2.4.0 +pydantic==2.12.5 +wrapt==2.1.0 +flashinfer-python==0.3.1 +python-dateutil==2.9.0.post0 +torchao==0.9.0 +cachetools==7.0.0 +soundfile==0.13.1 +diskcache==5.6.3 +onnx-ir==0.1.15 +docstring_parser==0.17.0 +matplotlib-inline==0.2.1 +Pygments==2.19.2 +wandb==0.24.1 +pure_eval==0.2.3 +ninja==1.13.0 +proto-plus==1.27.0 +pyzmq==27.1.0 +iniconfig==2.3.0 +Jinja2==3.1.6 +megatron-core==0.13.1 +uvloop==0.22.1 +pycparser==3.0 +pylatexenc==2.10 +decorator==5.2.1 +shellingham==1.5.4 +lxml==6.0.2 +safetensors==0.7.0 +xgrammar==0.1.25 
+pybase64==1.4.3 +ipython==9.9.0 +mistral_common==1.9.0 +rignore==0.7.6 +einops==0.8.2 +distlib==0.4.0 +triton==3.4.0 +executing==2.2.1 +grpcio==1.76.0 +pip==25.3 +verl==0.8.0.dev0 diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/files/wandb-metadata.json b/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..e73d2539857a19b6106780c6c8e8819bd013c3b2 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/files/wandb-metadata.json @@ -0,0 +1,93 @@ +{ + "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.12", + "startedAt": "2026-02-07T16:18:40.056002Z", + "args": [ + "--node-ip-address=172.16.34.29", + "--node-manager-port=33377", + "--object-store-name=/tmp/ray/session_2026-02-07_11-11-51_145730_485767/sockets/plasma_store", + "--raylet-name=/tmp/ray/session_2026-02-07_11-11-51_145730_485767/sockets/raylet", + "--redis-address=None", + "--metrics-agent-port=61864", + "--logging-rotate-bytes=536870912", + "--logging-rotate-backup-count=5", + "--runtime-env-agent-port=38528", + "--gcs-address=172.16.34.29:64074", + "--session-name=session_2026-02-07_11-11-51_145730_485767", + "--temp-dir=/tmp/ray", + "--webui=127.0.0.1:8297", + "--cluster-id=b9c6d8260ff7765ecb32ce5e2f2a6f96c20bb8e9e690a94355ef429e", + "--startup-token=128", + "--worker-launch-time-ms=1770480725136", + "--node-id=1722aff056e4c15ae7bfdb475e188eccdfc4f0fb5498c83483b19c28", + "--runtime-env-hash=1096984665" + ], + "program": "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py", + "git": { + "remote": "https://github.com/verl-project/verl", + "commit": "d9939add7a2a01923a9088891f913a5d20c4e622" + }, + "email": "shahidulshakib034@gmail.com", + "root": "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train", + "host": "gamma", + "executable": 
"/home/mshahidul/miniconda3/envs/verl2/bin/python3", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA A100 80GB PCIe", + "gpu_count": 6, + "disk": { + "/": { + "total": "3766429188096", + "used": "188225777664" + } + }, + "memory": { + "total": "1081814863872" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece" + } + ], + "cudaVersion": "13.0", + "writerId": "88p3jnmc3bcmts4akvh8cpqwqwpmfzgt" +} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/logs/debug-core.log b/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..5133d53a4b4e3b5cce309ffedbbffbc583d6dd14 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2026-02-07T11:18:40.230438437-05:00","level":"INFO","msg":"main: starting 
server","port-filename":"/tmp/tmpm8cmdcpl/port-494494.txt","pid":494494,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-07T11:18:40.231936108-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":494494} +{"time":"2026-02-07T11:18:40.231930554-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-494494-521925-457192743/socket","Net":"unix"}} +{"time":"2026-02-07T11:18:40.387955627-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-07T11:18:40.40286835-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"r3wx8thu","id":"1(@)"} +{"time":"2026-02-07T11:18:41.641271581-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"r3wx8thu","id":"1(@)"} +{"time":"2026-02-07T11:18:47.785157372-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"q1cc73tmhltw"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/logs/debug-internal.log b/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..699315f1b00bc19cd1b4c32f78d48476349d4f24 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2026-02-07T11:18:40.40454051-05:00","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-07T11:18:41.638510241-05:00","level":"INFO","msg":"stream: created new stream","id":"r3wx8thu"} +{"time":"2026-02-07T11:18:41.638679539-05:00","level":"INFO","msg":"handler: started","stream_id":"r3wx8thu"} +{"time":"2026-02-07T11:18:41.641242172-05:00","level":"INFO","msg":"stream: started","id":"r3wx8thu"} +{"time":"2026-02-07T11:18:41.641416143-05:00","level":"INFO","msg":"writer: 
started","stream_id":"r3wx8thu"} +{"time":"2026-02-07T11:18:41.641421939-05:00","level":"INFO","msg":"sender: started","stream_id":"r3wx8thu"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..6c0eb6969363c42037a4639b86860e747238b620 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/logs/debug.log @@ -0,0 +1,19 @@ +2026-02-07 11:18:40,082 INFO MainThread:494494 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-07 11:18:40,082 INFO MainThread:494494 [wandb_setup.py:_flush():81] Configure stats pid to 494494 +2026-02-07 11:18:40,083 INFO MainThread:494494 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-07 11:18:40,083 INFO MainThread:494494 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/logs/debug.log +2026-02-07 11:18:40,083 INFO MainThread:494494 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260207_111840-r3wx8thu/logs/debug-internal.log +2026-02-07 11:18:40,084 INFO MainThread:494494 [wandb_init.py:init():844] calling init triggers +2026-02-07 11:18:40,085 INFO MainThread:494494 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 6045, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 
'override_optimizer_config': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 4, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 2, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 
'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': 
False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 1024, 'response_length': 2048, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.6, 'ignore_eos': False, 'enforce_eager': False, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 2, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 
'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': False, 'agent': {'_target_': 'verl.workers.config.AgentLoopConfig', 'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 
'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': False}, 'model': {'_target_': 'verl.workers.config.HFModelConfig', 'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet', 'train_max_samples': -1, 'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 1024, 'max_response_length': 2048, 'train_batch_size': 8, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 
'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': None}, 'datagen': {'path': None, 'name': None}, 'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 6045, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 4, 
'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 
'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 'reward_loop_class_name': None, 'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 
'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 'target_kl': 0.1}, 'use_pf_ppo': False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-optimized-multiclinsum-gs', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 100, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 1, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': '/home/mshahidul/readctrl/code/RL_model/train_v2', 'max_actor_ckpt_to_keep': 1, 'max_critic_ckpt_to_keep': 1, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto', 'remove_previous_ckpt_in_save': True}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, 
'_wandb': {}} +2026-02-07 11:18:40,085 INFO MainThread:494494 [wandb_init.py:init():892] starting backend +2026-02-07 11:18:40,388 INFO MainThread:494494 [wandb_init.py:init():895] sending inform_init request +2026-02-07 11:18:40,397 INFO MainThread:494494 [wandb_init.py:init():903] backend started and connected +2026-02-07 11:18:40,411 INFO MainThread:494494 [wandb_init.py:init():973] updated telemetry +2026-02-07 11:18:40,474 INFO MainThread:494494 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-07 11:18:41,922 INFO MainThread:494494 [wandb_init.py:init():1042] starting run threads in backend +2026-02-07 11:18:42,759 INFO MainThread:494494 [wandb_run.py:_console_start():2529] atexit reg +2026-02-07 11:18:42,759 INFO MainThread:494494 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-07 11:18:42,760 INFO MainThread:494494 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-07 11:18:42,760 INFO MainThread:494494 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-07 11:18:42,773 INFO MainThread:494494 [wandb_init.py:init():1082] run started, returning control to user process diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..d78af0c289eb892ae52257239a41a200ae2d6cd8 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/files/output.log @@ -0,0 +1,6 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /home/mshahidul/readctrl/code/RL_model/train_v2/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/files/requirements.txt b/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c168a7097ef00c84a34fe1db69f95d6eaafb03f2 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/files/requirements.txt @@ -0,0 +1,269 @@ +verl==0.8.0.dev0 +psutil==7.1.3 +colorama==0.4.6 +annotated-doc==0.0.4 +sentry-sdk==2.51.0 +requests==2.32.5 +nvidia-cufile-cu12==1.13.1.3 +ml_dtypes==0.5.4 +xformers==0.0.32.post1 +sglang==0.5.2 +multidict==6.7.1 +typing_extensions==4.15.0 +nvidia-cusparselt-cu12==0.7.1 +openai-harmony==0.0.4 +transformers==4.56.1 +Werkzeug==3.1.5 +identify==2.6.16 +pytest==9.0.2 +nvidia-cuda-runtime-cu12==12.8.90 +GitPython==3.1.46 +cupy-cuda12x==13.6.0 +tokenizers==0.22.2 +pybind11==3.0.1 +google-api-core==2.29.0 +partial-json-parser==0.2.1.1.post7 +aiohttp-cors==0.8.1 +sniffio==1.3.1 +tensordict==0.10.0 +smart_open==7.5.0 +cffi==2.0.0 +wcwidth==0.5.3 +asttokens==3.0.1 +opencensus==0.11.4 +rpds-py==0.30.0 +py-spy==0.4.1 +gguf==0.17.1 +nvidia-nvjitlink-cu12==12.8.93 +httpx==0.28.1 +cuda-python==13.1.1 +annotated-types==0.7.0 +regex==2026.1.15 +vllm==0.11.0 +idna==3.11 +fsspec==2025.10.0 +parso==0.8.5 +pydantic-extra-types==2.11.0 +MarkupSafe==3.0.3 +cryptography==46.0.4 +filelock==3.20.3 +modelscope==1.34.0 +outlines==0.1.11 +dnspython==2.8.0 +scipy==1.17.0 +zipp==3.23.0 +PyYAML==6.0.3 +onnx==1.20.1 +torchdata==0.11.0 +cuda-pathfinder==1.3.3 +verl==0.8.0.dev0 +httptools==0.7.1 
+opencv-python-headless==4.13.0.90 +importlib_metadata==8.7.1 +peft==0.18.1 +opentelemetry-sdk==1.39.1 +python-json-logger==4.0.0 +cuda-bindings==13.1.1 +mdurl==0.1.2 +referencing==0.37.0 +xxhash==3.6.0 +interegular==0.3.3 +fastapi-cli==0.0.20 +uv==0.9.28 +tensorboard==2.20.0 +nvidia-cublas-cu12==12.8.4.1 +sentencepiece==0.2.1 +rich-toolkit==0.18.1 +numpy==2.2.0 +yarl==1.22.0 +opencv-fixer==0.2.5 +tqdm==4.67.2 +python-dotenv==1.2.1 +timm==1.0.16 +aiohappyeyeballs==2.6.1 +decord==0.6.0 +jiter==0.12.0 +airportsdata==20250909 +markdown-it-py==4.0.0 +nvidia-cusolver-cu12==11.7.3.90 +pyarrow==23.0.0 +opentelemetry-proto==1.39.1 +anyio==4.12.1 +pycryptodomex==3.23.0 +prometheus_client==0.24.1 +aiohttp==3.13.3 +urllib3==2.6.3 +pexpect==4.9.0 +pydantic-settings==2.12.0 +distro==1.9.0 +av==16.1.0 +cloudpickle==3.1.2 +mpmath==1.3.0 +certifi==2026.1.4 +antlr4-python3-runtime==4.9.3 +torchvision==0.23.0 +accelerate==1.12.0 +watchfiles==1.1.1 +ruff==0.14.14 +wheel==0.46.3 +omegaconf==2.3.0 +nvidia-cufft-cu12==11.3.3.83 +multiprocess==0.70.18 +frozendict==2.4.7 +sympy==1.14.0 +setproctitle==1.3.7 +setuptools==79.0.1 +py-cpuinfo==9.0.0 +ipython_pygments_lexers==1.1.1 +rich==14.3.2 +uvicorn==0.40.0 +openai==1.99.1 +outlines_core==0.2.11 +llvmlite==0.44.0 +nvidia-cuda-cupti-cu12==12.8.90 +attrs==25.4.0 +anthropic==0.77.0 +packaging==25.0 +fastrlock==0.8.3 +astor==0.8.1 +pluggy==1.6.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +psutil==7.2.2 +virtualenv==20.36.1 +cbor2==5.8.0 +compressed-tensors==0.11.0 +nvidia-cusparse-cu12==12.5.8.93 +networkx==3.6.1 +httpcore==1.0.9 +onnxscript==0.3.1 +smmap==5.0.2 +opencv-python==4.13.0.90 +traitlets==5.14.3 +python-multipart==0.0.22 +pyvers==0.1.0 +huggingface-hub==0.36.0 +pillow==12.1.0 +jsonschema==4.26.0 +cfgv==3.5.0 +optree==0.18.0 +email-validator==2.3.0 +tabulate==0.9.0 +pre_commit==4.5.1 +msgpack==1.1.2 +depyf==0.19.0 +numba==0.61.2 +six==1.17.0 +aiosignal==1.4.0 +nvidia-nvtx-cu12==12.8.90 +propcache==0.4.1 +torch_memory_saver==0.0.8 +h11==0.16.0 
+frozenlist==1.8.0 +websockets==16.0 +nvidia-cudnn-frontend==1.18.0 +build==1.4.0 +google-auth==2.48.0 +pycountry==24.6.1 +stack-data==0.6.3 +typing-inspection==0.4.2 +googleapis-common-protos==1.72.0 +pandas==3.0.0 +typer==0.21.1 +protobuf==6.33.5 +fastapi==0.128.0 +blake3==1.0.8 +opentelemetry-semantic-conventions==0.60b1 +opentelemetry-exporter-prometheus==0.60b1 +nvidia-cudnn-cu12==9.10.2.21 +Markdown==3.10.1 +liger_kernel==0.6.4 +nodeenv==1.10.0 +prompt_toolkit==3.0.52 +torchaudio==2.8.0 +codetiming==1.4.0 +platformdirs==4.5.1 +jsonschema-specifications==2025.9.1 +hydra-core==1.3.2 +tensorboard-data-server==0.7.2 +lm-format-enforcer==0.11.3 +pyasn1_modules==0.4.2 +tiktoken==0.12.0 +starlette==0.50.0 +pyproject_hooks==1.2.0 +flash_attn==2.8.1 +rsa==4.9.1 +ray==2.53.0 +nest-asyncio==1.6.0 +lark==1.2.2 +fastar==0.8.0 +orjson==3.11.6 +prometheus-fastapi-instrumentator==7.1.0 +opentelemetry-api==1.39.1 +mathruler==0.1.0 +pydantic_core==2.41.5 +fastapi-cloud-cli==0.11.0 +pynvml==13.0.1 +loguru==0.7.3 +torch==2.8.0 +msgspec==0.20.0 +nvidia-curand-cu12==10.3.9.90 +blobfile==3.0.0 +gitdb==4.0.12 +llguidance==0.7.30 +hf_transfer==0.1.9 +nvidia-nccl-cu12==2.27.3 +qwen-vl-utils==0.0.14 +ptyprocess==0.7.0 +ipdb==0.13.13 +opencensus-context==0.1.3 +jedi==0.19.2 +click==8.3.1 +datasets==4.5.0 +soxr==1.0.0 +sgl-kernel==0.3.9.post2 +colorful==0.5.8 +pyasn1==0.6.2 +charset-normalizer==3.4.4 +nvidia-ml-py==13.590.48 +hf-xet==1.2.0 +dill==0.4.0 +absl-py==2.4.0 +pydantic==2.12.5 +wrapt==2.1.0 +flashinfer-python==0.3.1 +python-dateutil==2.9.0.post0 +torchao==0.9.0 +cachetools==7.0.0 +soundfile==0.13.1 +diskcache==5.6.3 +onnx-ir==0.1.15 +docstring_parser==0.17.0 +matplotlib-inline==0.2.1 +Pygments==2.19.2 +wandb==0.24.1 +pure_eval==0.2.3 +ninja==1.13.0 +proto-plus==1.27.0 +pyzmq==27.1.0 +iniconfig==2.3.0 +Jinja2==3.1.6 +megatron-core==0.13.1 +uvloop==0.22.1 +pycparser==3.0 +pylatexenc==2.10 +decorator==5.2.1 +shellingham==1.5.4 +lxml==6.0.2 +safetensors==0.7.0 +xgrammar==0.1.25 
+pybase64==1.4.3 +ipython==9.9.0 +mistral_common==1.9.0 +rignore==0.7.6 +einops==0.8.2 +distlib==0.4.0 +triton==3.4.0 +executing==2.2.1 +grpcio==1.76.0 +pip==25.3 +verl==0.8.0.dev0 diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/files/wandb-metadata.json b/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..e37ed81dbf43490c68dcddbcc97e81a0e92a0b1c --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/files/wandb-metadata.json @@ -0,0 +1,93 @@ +{ + "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.12", + "startedAt": "2026-02-07T16:30:41.971066Z", + "args": [ + "--node-ip-address=172.16.34.29", + "--node-manager-port=38731", + "--object-store-name=/tmp/ray/session_2026-02-07_11-23-21_176789_560668/sockets/plasma_store", + "--raylet-name=/tmp/ray/session_2026-02-07_11-23-21_176789_560668/sockets/raylet", + "--redis-address=None", + "--metrics-agent-port=57787", + "--logging-rotate-bytes=536870912", + "--logging-rotate-backup-count=5", + "--runtime-env-agent-port=61716", + "--gcs-address=172.16.34.29:60023", + "--session-name=session_2026-02-07_11-23-21_176789_560668", + "--temp-dir=/tmp/ray", + "--webui=127.0.0.1:8297", + "--cluster-id=370a1f724330a4aafe3fbd01e4e14a241970e55ca223570d10827c4e", + "--startup-token=128", + "--worker-launch-time-ms=1770481416707", + "--node-id=3167b7b31cc1bbafa7cd7475e85c6e83f52bf93321eadd37e0235a0a", + "--runtime-env-hash=-2086329310" + ], + "program": "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py", + "git": { + "remote": "https://github.com/verl-project/verl", + "commit": "d9939add7a2a01923a9088891f913a5d20c4e622" + }, + "email": "shahidulshakib034@gmail.com", + "root": "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train", + "host": "gamma", + "executable": 
"/home/mshahidul/miniconda3/envs/verl2/bin/python3", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA A100 80GB PCIe", + "gpu_count": 6, + "disk": { + "/": { + "total": "3766429188096", + "used": "188255096832" + } + }, + "memory": { + "total": "1081814863872" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece" + } + ], + "cudaVersion": "13.0", + "writerId": "zs579pc3uw4lpzesyrt3i5jvsf78epwk" +} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/logs/debug-core.log b/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..1d02d5242e21387cce05abcfb67aa63854f41379 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2026-02-07T11:30:42.133472663-05:00","level":"INFO","msg":"main: starting 
server","port-filename":"/tmp/tmpxfufmriw/port-572506.txt","pid":572506,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-07T11:30:42.134859961-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":572506} +{"time":"2026-02-07T11:30:42.134811243-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-572506-600660-515315493/socket","Net":"unix"}} +{"time":"2026-02-07T11:30:42.295094179-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-07T11:30:42.309590652-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"bhf8tuxa","id":"1(@)"} +{"time":"2026-02-07T11:30:43.80865873-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"bhf8tuxa","id":"1(@)"} +{"time":"2026-02-07T11:30:50.138881667-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"q0s6lmd8t1ko"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/logs/debug-internal.log b/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..cb6e8886d491345e08232af5c0d7b703fa4575fe --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2026-02-07T11:30:42.310964076-05:00","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-07T11:30:43.754825568-05:00","level":"INFO","msg":"stream: created new stream","id":"bhf8tuxa"} +{"time":"2026-02-07T11:30:43.755004364-05:00","level":"INFO","msg":"handler: started","stream_id":"bhf8tuxa"} +{"time":"2026-02-07T11:30:43.808608409-05:00","level":"INFO","msg":"stream: started","id":"bhf8tuxa"} +{"time":"2026-02-07T11:30:43.80864944-05:00","level":"INFO","msg":"writer: 
started","stream_id":"bhf8tuxa"} +{"time":"2026-02-07T11:30:43.80868108-05:00","level":"INFO","msg":"sender: started","stream_id":"bhf8tuxa"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..4c9ceb586a4aa3f2f0ec8463aea5cd731ed66a59 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/logs/debug.log @@ -0,0 +1,21 @@ +2026-02-07 11:30:41,993 INFO MainThread:572506 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-07 11:30:41,993 INFO MainThread:572506 [wandb_setup.py:_flush():81] Configure stats pid to 572506 +2026-02-07 11:30:41,993 INFO MainThread:572506 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-07 11:30:41,994 INFO MainThread:572506 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/logs/debug.log +2026-02-07 11:30:41,994 INFO MainThread:572506 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/logs/debug-internal.log +2026-02-07 11:30:41,994 INFO MainThread:572506 [wandb_init.py:init():844] calling init triggers +2026-02-07 11:30:41,996 INFO MainThread:572506 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 6045, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 
'override_optimizer_config': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 4, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 2, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 
'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': 
False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 1024, 'response_length': 2048, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.6, 'ignore_eos': False, 'enforce_eager': False, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 2, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 
'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': False, 'agent': {'_target_': 'verl.workers.config.AgentLoopConfig', 'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 
'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': False}, 'model': {'_target_': 'verl.workers.config.HFModelConfig', 'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet', 'train_max_samples': -1, 'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 1024, 'max_response_length': 2048, 'train_batch_size': 8, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 
'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': None}, 'datagen': {'path': None, 'name': None}, 'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 6045, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 4, 
'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 
'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 'reward_loop_class_name': None, 'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 
'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 'target_kl': 0.1}, 'use_pf_ppo': False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-optimized-multiclinsum-gs', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 100, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 1, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': '/home/mshahidul/readctrl/code/RL_model/train_v2', 'max_actor_ckpt_to_keep': 1, 'max_critic_ckpt_to_keep': 1, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto', 'remove_previous_ckpt_in_save': True}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, 
'_wandb': {}} +2026-02-07 11:30:41,997 INFO MainThread:572506 [wandb_init.py:init():892] starting backend +2026-02-07 11:30:42,295 INFO MainThread:572506 [wandb_init.py:init():895] sending inform_init request +2026-02-07 11:30:42,304 INFO MainThread:572506 [wandb_init.py:init():903] backend started and connected +2026-02-07 11:30:42,318 INFO MainThread:572506 [wandb_init.py:init():973] updated telemetry +2026-02-07 11:30:42,346 INFO MainThread:572506 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-07 11:30:44,107 INFO MainThread:572506 [wandb_init.py:init():1042] starting run threads in backend +2026-02-07 11:30:45,108 INFO MainThread:572506 [wandb_run.py:_console_start():2529] atexit reg +2026-02-07 11:30:45,109 INFO MainThread:572506 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-07 11:30:45,109 INFO MainThread:572506 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-07 11:30:45,109 INFO MainThread:572506 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-07 11:30:45,121 INFO MainThread:572506 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-07 12:19:31,904 INFO wandb-AsyncioManager-main:572506 [service_client.py:_forward_responses():94] Reached EOF. +2026-02-07 12:19:31,914 INFO wandb-AsyncioManager-main:572506 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles. diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..d78af0c289eb892ae52257239a41a200ae2d6cd8 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/files/output.log @@ -0,0 +1,6 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /home/mshahidul/readctrl/code/RL_model/train_v2/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/files/requirements.txt b/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c168a7097ef00c84a34fe1db69f95d6eaafb03f2 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/files/requirements.txt @@ -0,0 +1,269 @@ +verl==0.8.0.dev0 +psutil==7.1.3 +colorama==0.4.6 +annotated-doc==0.0.4 +sentry-sdk==2.51.0 +requests==2.32.5 +nvidia-cufile-cu12==1.13.1.3 +ml_dtypes==0.5.4 +xformers==0.0.32.post1 +sglang==0.5.2 +multidict==6.7.1 +typing_extensions==4.15.0 +nvidia-cusparselt-cu12==0.7.1 +openai-harmony==0.0.4 +transformers==4.56.1 +Werkzeug==3.1.5 +identify==2.6.16 +pytest==9.0.2 +nvidia-cuda-runtime-cu12==12.8.90 +GitPython==3.1.46 +cupy-cuda12x==13.6.0 +tokenizers==0.22.2 +pybind11==3.0.1 +google-api-core==2.29.0 +partial-json-parser==0.2.1.1.post7 +aiohttp-cors==0.8.1 +sniffio==1.3.1 +tensordict==0.10.0 +smart_open==7.5.0 +cffi==2.0.0 +wcwidth==0.5.3 +asttokens==3.0.1 +opencensus==0.11.4 +rpds-py==0.30.0 +py-spy==0.4.1 +gguf==0.17.1 +nvidia-nvjitlink-cu12==12.8.93 +httpx==0.28.1 +cuda-python==13.1.1 +annotated-types==0.7.0 +regex==2026.1.15 +vllm==0.11.0 +idna==3.11 +fsspec==2025.10.0 +parso==0.8.5 +pydantic-extra-types==2.11.0 +MarkupSafe==3.0.3 +cryptography==46.0.4 +filelock==3.20.3 +modelscope==1.34.0 +outlines==0.1.11 +dnspython==2.8.0 +scipy==1.17.0 +zipp==3.23.0 +PyYAML==6.0.3 +onnx==1.20.1 +torchdata==0.11.0 +cuda-pathfinder==1.3.3 +verl==0.8.0.dev0 +httptools==0.7.1 
+opencv-python-headless==4.13.0.90 +importlib_metadata==8.7.1 +peft==0.18.1 +opentelemetry-sdk==1.39.1 +python-json-logger==4.0.0 +cuda-bindings==13.1.1 +mdurl==0.1.2 +referencing==0.37.0 +xxhash==3.6.0 +interegular==0.3.3 +fastapi-cli==0.0.20 +uv==0.9.28 +tensorboard==2.20.0 +nvidia-cublas-cu12==12.8.4.1 +sentencepiece==0.2.1 +rich-toolkit==0.18.1 +numpy==2.2.0 +yarl==1.22.0 +opencv-fixer==0.2.5 +tqdm==4.67.2 +python-dotenv==1.2.1 +timm==1.0.16 +aiohappyeyeballs==2.6.1 +decord==0.6.0 +jiter==0.12.0 +airportsdata==20250909 +markdown-it-py==4.0.0 +nvidia-cusolver-cu12==11.7.3.90 +pyarrow==23.0.0 +opentelemetry-proto==1.39.1 +anyio==4.12.1 +pycryptodomex==3.23.0 +prometheus_client==0.24.1 +aiohttp==3.13.3 +urllib3==2.6.3 +pexpect==4.9.0 +pydantic-settings==2.12.0 +distro==1.9.0 +av==16.1.0 +cloudpickle==3.1.2 +mpmath==1.3.0 +certifi==2026.1.4 +antlr4-python3-runtime==4.9.3 +torchvision==0.23.0 +accelerate==1.12.0 +watchfiles==1.1.1 +ruff==0.14.14 +wheel==0.46.3 +omegaconf==2.3.0 +nvidia-cufft-cu12==11.3.3.83 +multiprocess==0.70.18 +frozendict==2.4.7 +sympy==1.14.0 +setproctitle==1.3.7 +setuptools==79.0.1 +py-cpuinfo==9.0.0 +ipython_pygments_lexers==1.1.1 +rich==14.3.2 +uvicorn==0.40.0 +openai==1.99.1 +outlines_core==0.2.11 +llvmlite==0.44.0 +nvidia-cuda-cupti-cu12==12.8.90 +attrs==25.4.0 +anthropic==0.77.0 +packaging==25.0 +fastrlock==0.8.3 +astor==0.8.1 +pluggy==1.6.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +psutil==7.2.2 +virtualenv==20.36.1 +cbor2==5.8.0 +compressed-tensors==0.11.0 +nvidia-cusparse-cu12==12.5.8.93 +networkx==3.6.1 +httpcore==1.0.9 +onnxscript==0.3.1 +smmap==5.0.2 +opencv-python==4.13.0.90 +traitlets==5.14.3 +python-multipart==0.0.22 +pyvers==0.1.0 +huggingface-hub==0.36.0 +pillow==12.1.0 +jsonschema==4.26.0 +cfgv==3.5.0 +optree==0.18.0 +email-validator==2.3.0 +tabulate==0.9.0 +pre_commit==4.5.1 +msgpack==1.1.2 +depyf==0.19.0 +numba==0.61.2 +six==1.17.0 +aiosignal==1.4.0 +nvidia-nvtx-cu12==12.8.90 +propcache==0.4.1 +torch_memory_saver==0.0.8 +h11==0.16.0 
+frozenlist==1.8.0 +websockets==16.0 +nvidia-cudnn-frontend==1.18.0 +build==1.4.0 +google-auth==2.48.0 +pycountry==24.6.1 +stack-data==0.6.3 +typing-inspection==0.4.2 +googleapis-common-protos==1.72.0 +pandas==3.0.0 +typer==0.21.1 +protobuf==6.33.5 +fastapi==0.128.0 +blake3==1.0.8 +opentelemetry-semantic-conventions==0.60b1 +opentelemetry-exporter-prometheus==0.60b1 +nvidia-cudnn-cu12==9.10.2.21 +Markdown==3.10.1 +liger_kernel==0.6.4 +nodeenv==1.10.0 +prompt_toolkit==3.0.52 +torchaudio==2.8.0 +codetiming==1.4.0 +platformdirs==4.5.1 +jsonschema-specifications==2025.9.1 +hydra-core==1.3.2 +tensorboard-data-server==0.7.2 +lm-format-enforcer==0.11.3 +pyasn1_modules==0.4.2 +tiktoken==0.12.0 +starlette==0.50.0 +pyproject_hooks==1.2.0 +flash_attn==2.8.1 +rsa==4.9.1 +ray==2.53.0 +nest-asyncio==1.6.0 +lark==1.2.2 +fastar==0.8.0 +orjson==3.11.6 +prometheus-fastapi-instrumentator==7.1.0 +opentelemetry-api==1.39.1 +mathruler==0.1.0 +pydantic_core==2.41.5 +fastapi-cloud-cli==0.11.0 +pynvml==13.0.1 +loguru==0.7.3 +torch==2.8.0 +msgspec==0.20.0 +nvidia-curand-cu12==10.3.9.90 +blobfile==3.0.0 +gitdb==4.0.12 +llguidance==0.7.30 +hf_transfer==0.1.9 +nvidia-nccl-cu12==2.27.3 +qwen-vl-utils==0.0.14 +ptyprocess==0.7.0 +ipdb==0.13.13 +opencensus-context==0.1.3 +jedi==0.19.2 +click==8.3.1 +datasets==4.5.0 +soxr==1.0.0 +sgl-kernel==0.3.9.post2 +colorful==0.5.8 +pyasn1==0.6.2 +charset-normalizer==3.4.4 +nvidia-ml-py==13.590.48 +hf-xet==1.2.0 +dill==0.4.0 +absl-py==2.4.0 +pydantic==2.12.5 +wrapt==2.1.0 +flashinfer-python==0.3.1 +python-dateutil==2.9.0.post0 +torchao==0.9.0 +cachetools==7.0.0 +soundfile==0.13.1 +diskcache==5.6.3 +onnx-ir==0.1.15 +docstring_parser==0.17.0 +matplotlib-inline==0.2.1 +Pygments==2.19.2 +wandb==0.24.1 +pure_eval==0.2.3 +ninja==1.13.0 +proto-plus==1.27.0 +pyzmq==27.1.0 +iniconfig==2.3.0 +Jinja2==3.1.6 +megatron-core==0.13.1 +uvloop==0.22.1 +pycparser==3.0 +pylatexenc==2.10 +decorator==5.2.1 +shellingham==1.5.4 +lxml==6.0.2 +safetensors==0.7.0 +xgrammar==0.1.25 
+pybase64==1.4.3 +ipython==9.9.0 +mistral_common==1.9.0 +rignore==0.7.6 +einops==0.8.2 +distlib==0.4.0 +triton==3.4.0 +executing==2.2.1 +grpcio==1.76.0 +pip==25.3 +verl==0.8.0.dev0 diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/files/wandb-metadata.json b/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..50eb1b26b7ac445b86f4d41e8f002c070f0b24a6 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/files/wandb-metadata.json @@ -0,0 +1,93 @@ +{ + "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.12", + "startedAt": "2026-02-07T17:26:07.697458Z", + "args": [ + "--node-ip-address=172.16.34.29", + "--node-manager-port=39565", + "--object-store-name=/tmp/ray/session_2026-02-07_12-20-13_689088_750591/sockets/plasma_store", + "--raylet-name=/tmp/ray/session_2026-02-07_12-20-13_689088_750591/sockets/raylet", + "--redis-address=None", + "--metrics-agent-port=55729", + "--logging-rotate-bytes=536870912", + "--logging-rotate-backup-count=5", + "--runtime-env-agent-port=64966", + "--gcs-address=172.16.34.29:59877", + "--session-name=session_2026-02-07_12-20-13_689088_750591", + "--temp-dir=/tmp/ray", + "--webui=127.0.0.1:8297", + "--cluster-id=de0d59f03571a90ef7c45248c40006861cdf675236b4d421c9d82ff5", + "--startup-token=128", + "--worker-launch-time-ms=1770484828189", + "--node-id=bfe68127d86b81da9c46ebcb76220fe364225b2883247c2544a9a01d", + "--runtime-env-hash=-2086329310" + ], + "program": "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py", + "git": { + "remote": "https://github.com/verl-project/verl", + "commit": "d9939add7a2a01923a9088891f913a5d20c4e622" + }, + "email": "shahidulshakib034@gmail.com", + "root": "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train", + "host": "gamma", + "executable": 
"/home/mshahidul/miniconda3/envs/verl2/bin/python3", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA A100 80GB PCIe", + "gpu_count": 6, + "disk": { + "/": { + "total": "3766429188096", + "used": "188373913600" + } + }, + "memory": { + "total": "1081814863872" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece" + } + ], + "cudaVersion": "13.0", + "writerId": "qzshr3cxtxqmv96cwqiw3741i4bjegop" +} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/files/wandb-summary.json b/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..ec6d562821d7b3e629a4dc356aaa6b9124950a46 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":1223},"_runtime":1223} \ No newline at end of file diff --git 
a/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/logs/debug-core.log b/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..ee346208ba57b683b1615796dd8f5cbb2a7e2f64 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/logs/debug-core.log @@ -0,0 +1,8 @@ +{"time":"2026-02-07T12:26:07.836102374-05:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp1896t4x6/port-759076.txt","pid":759076,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-07T12:26:07.837420986-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":759076} +{"time":"2026-02-07T12:26:07.83733235-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-759076-769345-1309437185/socket","Net":"unix"}} +{"time":"2026-02-07T12:26:08.006628386-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-07T12:26:08.020850136-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"4jfbiq6q","id":"1(@)"} +{"time":"2026-02-07T12:26:09.659024492-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"4jfbiq6q","id":"1(@)"} +{"time":"2026-02-07T12:26:16.138163252-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"dv9r2vfji77v"} +{"time":"2026-02-07T12:46:33.378450347-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"dv9r2vfji77v"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/logs/debug-internal.log b/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..5f1c4b44a01bc744776a5f6cdc4ce2d39dee3bf8 --- /dev/null +++ 
b/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2026-02-07T12:26:08.021962867-05:00","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-07T12:26:09.656270809-05:00","level":"INFO","msg":"stream: created new stream","id":"4jfbiq6q"} +{"time":"2026-02-07T12:26:09.656429545-05:00","level":"INFO","msg":"handler: started","stream_id":"4jfbiq6q"} +{"time":"2026-02-07T12:26:09.65899053-05:00","level":"INFO","msg":"stream: started","id":"4jfbiq6q"} +{"time":"2026-02-07T12:26:09.659226983-05:00","level":"INFO","msg":"writer: started","stream_id":"4jfbiq6q"} +{"time":"2026-02-07T12:26:09.659263272-05:00","level":"INFO","msg":"sender: started","stream_id":"4jfbiq6q"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..550bd97b81b4f961b6ab0608e6c312df0119598b --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/logs/debug.log @@ -0,0 +1,23 @@ +2026-02-07 12:26:07,717 INFO MainThread:759076 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-07 12:26:07,718 INFO MainThread:759076 [wandb_setup.py:_flush():81] Configure stats pid to 759076 +2026-02-07 12:26:07,718 INFO MainThread:759076 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-07 12:26:07,718 INFO MainThread:759076 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/logs/debug.log +2026-02-07 12:26:07,718 INFO MainThread:759076 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/logs/debug-internal.log +2026-02-07 
12:26:07,719 INFO MainThread:759076 [wandb_init.py:init():844] calling init triggers +2026-02-07 12:26:07,720 INFO MainThread:759076 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 6045, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 4, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 2, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 
0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 
'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 1024, 'response_length': 2048, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.6, 'ignore_eos': False, 'enforce_eager': False, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 2, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 
0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': False, 'agent': {'_target_': 'verl.workers.config.AgentLoopConfig', 'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 
'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': False}, 'model': {'_target_': 'verl.workers.config.HFModelConfig', 'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet', 'train_max_samples': -1, 
'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 1024, 'max_response_length': 2048, 'train_batch_size': 8, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': None}, 'datagen': {'path': None, 'name': None}, 'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 6045, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 
'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 4, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 
'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 'reward_loop_class_name': None, 'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 
'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 'target_kl': 0.1}, 'use_pf_ppo': False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-optimized-multiclinsum-gs', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 100, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 1, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': '/home/mshahidul/readctrl/code/RL_model/train_v2', 'max_actor_ckpt_to_keep': 1, 'max_critic_ckpt_to_keep': 1, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto', 'remove_previous_ckpt_in_save': True}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 
'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, '_wandb': {}} +2026-02-07 12:26:07,720 INFO MainThread:759076 [wandb_init.py:init():892] starting backend +2026-02-07 12:26:08,007 INFO MainThread:759076 [wandb_init.py:init():895] sending inform_init request +2026-02-07 12:26:08,018 INFO MainThread:759076 [wandb_init.py:init():903] backend started and connected +2026-02-07 12:26:08,029 INFO MainThread:759076 [wandb_init.py:init():973] updated telemetry +2026-02-07 12:26:08,052 INFO MainThread:759076 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-07 12:26:09,897 INFO MainThread:759076 [wandb_init.py:init():1042] starting run threads in backend +2026-02-07 12:26:11,112 INFO MainThread:759076 [wandb_run.py:_console_start():2529] atexit reg +2026-02-07 12:26:11,114 INFO MainThread:759076 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-07 12:26:11,115 INFO MainThread:759076 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-07 12:26:11,115 INFO MainThread:759076 [wandb_run.py:_redirect():2469] Redirects installed. 
+2026-02-07 12:26:11,127 INFO MainThread:759076 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-07 12:46:33,375 INFO MainThread:759076 [wandb_run.py:_finish():2295] finishing run shahidulshakib034-khulna-university-of-engineering-techn/readctrl-verl/4jfbiq6q +2026-02-07 12:46:33,376 INFO MainThread:759076 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0 +2026-02-07 12:46:33,377 INFO MainThread:759076 [wandb_run.py:_restore():2476] restore +2026-02-07 12:46:33,377 INFO MainThread:759076 [wandb_run.py:_restore():2482] restore done diff --git a/code/RL_model/verl/verl_train/wandb/run-20260207_134018-vq0iy4i3/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260207_134018-vq0iy4i3/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..4d4500c4494be5ebb4275048fa0eb1f31b675ed8 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260207_134018-vq0iy4i3/files/output.log @@ -0,0 +1,14 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /home/mshahidul/readctrl/code/RL_model/train_v2/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} +validation generation end +("Initial validation metrics: {'val-aux/multiclinsum/reward/mean@1': " + "np.float64(-5.449668711774489), 'val-core/multiclinsum/acc/mean@1': " + "np.float64(-5.449668725530857), 'val-aux/num_turns/min': np.int32(2), " + "'val-aux/num_turns/max': np.int32(2), 'val-aux/num_turns/mean': " + 'np.float64(2.0)}') +step:0 - val-aux/multiclinsum/reward/mean@1:np.float64(-5.449668711774489) - val-core/multiclinsum/acc/mean@1:np.float64(-5.449668725530857) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) +Training Progress: 0%| | 0/90 [00:00\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-02-09T15:16:49.160385314-05:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/shahidulshakib034-khulna-university-of-engineering-techn/readctrl-verl/1bt9yf1w/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260209_134931-1bt9yf1w/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260209_134931-1bt9yf1w/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..72d85dbd24168bfa806ea75fac99f7383aa132ef --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260209_134931-1bt9yf1w/logs/debug.log @@ -0,0 +1,19 @@ +2026-02-09 13:49:31,410 INFO MainThread:2582064 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-09 13:49:31,411 INFO MainThread:2582064 [wandb_setup.py:_flush():81] Configure stats pid to 2582064 +2026-02-09 13:49:31,411 INFO MainThread:2582064 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-09 13:49:31,411 INFO MainThread:2582064 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260209_134931-1bt9yf1w/logs/debug.log +2026-02-09 13:49:31,411 INFO MainThread:2582064 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260209_134931-1bt9yf1w/logs/debug-internal.log +2026-02-09 13:49:31,411 INFO MainThread:2582064 [wandb_init.py:init():844] calling init triggers +2026-02-09 13:49:31,413 INFO MainThread:2582064 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 
'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 32, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 
'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 
'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 1024, 'response_length': 2048, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.6, 'ignore_eos': False, 'enforce_eager': False, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': False, 'agent': {'_target_': 
'verl.workers.config.AgentLoopConfig', 'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': False}, 'model': {'_target_': 
'verl.workers.config.HFModelConfig', 'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet', 'train_max_samples': -1, 'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 1024, 'max_response_length': 2048, 'train_batch_size': 512, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': None}, 'datagen': {'path': 
None, 'name': None}, 'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 
'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 'reward_loop_class_name': None, 
'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 'target_kl': 0.1}, 'use_pf_ppo': 
False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-en', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 5, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 10, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': '/home/mshahidul/readctrl/code/RL_model/train_v2', 'max_actor_ckpt_to_keep': 1, 'max_critic_ckpt_to_keep': 1, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto', 'remove_previous_ckpt_in_save': True}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, '_wandb': {}} +2026-02-09 13:49:31,413 INFO MainThread:2582064 [wandb_init.py:init():892] starting backend +2026-02-09 13:49:31,734 INFO 
MainThread:2582064 [wandb_init.py:init():895] sending inform_init request +2026-02-09 13:49:31,746 INFO MainThread:2582064 [wandb_init.py:init():903] backend started and connected +2026-02-09 13:49:31,764 INFO MainThread:2582064 [wandb_init.py:init():973] updated telemetry +2026-02-09 13:49:31,790 INFO MainThread:2582064 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-09 13:49:33,768 INFO MainThread:2582064 [wandb_init.py:init():1042] starting run threads in backend +2026-02-09 13:49:34,669 INFO MainThread:2582064 [wandb_run.py:_console_start():2529] atexit reg +2026-02-09 13:49:34,669 INFO MainThread:2582064 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-09 13:49:34,670 INFO MainThread:2582064 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-09 13:49:34,670 INFO MainThread:2582064 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-09 13:49:34,683 INFO MainThread:2582064 [wandb_init.py:init():1082] run started, returning control to user process diff --git a/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..1363cb81f80a8d0bd6d71ec1070f9a51abcbc7eb --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/files/output.log @@ -0,0 +1,6 @@ +wandb: Detected [dspy, litellm, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /home/mshahidul/readctrl/code/RL_model/train_v2/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/files/requirements.txt b/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd2de633cd2e89d34a77b40967a7fc9ed3177c34 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/files/requirements.txt @@ -0,0 +1,283 @@ +verl==0.8.0.dev0 +psutil==7.1.3 +colorama==0.4.6 +annotated-doc==0.0.4 +sentry-sdk==2.51.0 +requests==2.32.5 +nvidia-cufile-cu12==1.13.1.3 +ml_dtypes==0.5.4 +xformers==0.0.32.post1 +sglang==0.5.2 +multidict==6.7.1 +typing_extensions==4.15.0 +nvidia-cusparselt-cu12==0.7.1 +openai-harmony==0.0.4 +transformers==4.56.1 +Werkzeug==3.1.5 +identify==2.6.16 +gepa==0.0.26 +pytest==9.0.2 +nvidia-cuda-runtime-cu12==12.8.90 +GitPython==3.1.46 +cupy-cuda12x==13.6.0 +tokenizers==0.22.2 +pybind11==3.0.1 +google-api-core==2.29.0 +partial-json-parser==0.2.1.1.post7 +aiohttp-cors==0.8.1 +sniffio==1.3.1 +tensordict==0.10.0 +smart_open==7.5.0 +cffi==2.0.0 +wcwidth==0.5.3 +asttokens==3.0.1 +opencensus==0.11.4 +rpds-py==0.30.0 +py-spy==0.4.1 +litellm==1.81.9 +gguf==0.17.1 +nvidia-nvjitlink-cu12==12.8.93 +httpx==0.28.1 +cuda-python==13.1.1 +annotated-types==0.7.0 +regex==2026.1.15 +vllm==0.11.0 +idna==3.11 +fsspec==2025.10.0 +parso==0.8.5 +pydantic-extra-types==2.11.0 +MarkupSafe==3.0.3 +cryptography==46.0.4 +openai==2.17.0 +filelock==3.20.3 +modelscope==1.34.0 +outlines==0.1.11 +dnspython==2.8.0 +scipy==1.17.0 +zipp==3.23.0 +PyYAML==6.0.3 +onnx==1.20.1 +torchdata==0.11.0 
+cuda-pathfinder==1.3.3 +asyncer==0.0.8 +verl==0.8.0.dev0 +httptools==0.7.1 +opencv-python-headless==4.13.0.90 +importlib_metadata==8.7.1 +peft==0.18.1 +opentelemetry-sdk==1.39.1 +python-json-logger==4.0.0 +alembic==1.18.3 +cuda-bindings==13.1.1 +mdurl==0.1.2 +referencing==0.37.0 +xxhash==3.6.0 +interegular==0.3.3 +fastapi-cli==0.0.20 +uv==0.9.28 +tensorboard==2.20.0 +nvidia-cublas-cu12==12.8.4.1 +sentencepiece==0.2.1 +rich-toolkit==0.18.1 +numpy==2.2.0 +yarl==1.22.0 +opencv-fixer==0.2.5 +tqdm==4.67.2 +python-dotenv==1.2.1 +Mako==1.3.10 +timm==1.0.16 +aiohappyeyeballs==2.6.1 +decord==0.6.0 +jiter==0.12.0 +airportsdata==20250909 +markdown-it-py==4.0.0 +nvidia-cusolver-cu12==11.7.3.90 +pyarrow==23.0.0 +opentelemetry-proto==1.39.1 +anyio==4.12.1 +pycryptodomex==3.23.0 +prometheus_client==0.24.1 +aiohttp==3.13.3 +urllib3==2.6.3 +pexpect==4.9.0 +pydantic-settings==2.12.0 +distro==1.9.0 +av==16.1.0 +cloudpickle==3.1.2 +mpmath==1.3.0 +certifi==2026.1.4 +antlr4-python3-runtime==4.9.3 +torchvision==0.23.0 +accelerate==1.12.0 +watchfiles==1.1.1 +ruff==0.14.14 +wheel==0.46.3 +omegaconf==2.3.0 +nvidia-cufft-cu12==11.3.3.83 +multiprocess==0.70.18 +frozendict==2.4.7 +sympy==1.14.0 +setproctitle==1.3.7 +optuna==4.7.0 +setuptools==79.0.1 +py-cpuinfo==9.0.0 +ipython_pygments_lexers==1.1.1 +rich==14.3.2 +uvicorn==0.40.0 +outlines_core==0.2.11 +llvmlite==0.44.0 +nvidia-cuda-cupti-cu12==12.8.90 +attrs==25.4.0 +anthropic==0.77.0 +packaging==25.0 +fastrlock==0.8.3 +astor==0.8.1 +pluggy==1.6.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +psutil==7.2.2 +virtualenv==20.36.1 +cbor2==5.8.0 +tenacity==9.1.4 +compressed-tensors==0.11.0 +SQLAlchemy==2.0.46 +nvidia-cusparse-cu12==12.5.8.93 +networkx==3.6.1 +httpcore==1.0.9 +onnxscript==0.3.1 +smmap==5.0.2 +opencv-python==4.13.0.90 +traitlets==5.14.3 +python-multipart==0.0.22 +pyvers==0.1.0 +huggingface-hub==0.36.0 +pillow==12.1.0 +jsonschema==4.26.0 +cfgv==3.5.0 +optree==0.18.0 +email-validator==2.3.0 +tabulate==0.9.0 +pre_commit==4.5.1 +msgpack==1.1.2 
+depyf==0.19.0 +numba==0.61.2 +six==1.17.0 +aiosignal==1.4.0 +nvidia-nvtx-cu12==12.8.90 +propcache==0.4.1 +torch_memory_saver==0.0.8 +h11==0.16.0 +frozenlist==1.8.0 +websockets==16.0 +nvidia-cudnn-frontend==1.18.0 +build==1.4.0 +google-auth==2.48.0 +pycountry==24.6.1 +colorlog==6.10.1 +stack-data==0.6.3 +typing-inspection==0.4.2 +googleapis-common-protos==1.72.0 +pandas==3.0.0 +typer==0.21.1 +protobuf==6.33.5 +fastapi==0.128.0 +blake3==1.0.8 +opentelemetry-semantic-conventions==0.60b1 +opentelemetry-exporter-prometheus==0.60b1 +nvidia-cudnn-cu12==9.10.2.21 +Markdown==3.10.1 +liger_kernel==0.6.4 +json_repair==0.57.1 +nodeenv==1.10.0 +prompt_toolkit==3.0.52 +torchaudio==2.8.0 +codetiming==1.4.0 +platformdirs==4.5.1 +jsonschema-specifications==2025.9.1 +hydra-core==1.3.2 +tensorboard-data-server==0.7.2 +lm-format-enforcer==0.11.3 +pyasn1_modules==0.4.2 +tiktoken==0.12.0 +starlette==0.50.0 +pyproject_hooks==1.2.0 +flash_attn==2.8.1 +rsa==4.9.1 +ray==2.53.0 +nest-asyncio==1.6.0 +lark==1.2.2 +fastar==0.8.0 +orjson==3.11.6 +prometheus-fastapi-instrumentator==7.1.0 +opentelemetry-api==1.39.1 +mathruler==0.1.0 +pydantic_core==2.41.5 +fastapi-cloud-cli==0.11.0 +pynvml==13.0.1 +loguru==0.7.3 +torch==2.8.0 +msgspec==0.20.0 +nvidia-curand-cu12==10.3.9.90 +blobfile==3.0.0 +gitdb==4.0.12 +llguidance==0.7.30 +hf_transfer==0.1.9 +nvidia-nccl-cu12==2.27.3 +qwen-vl-utils==0.0.14 +ptyprocess==0.7.0 +ipdb==0.13.13 +opencensus-context==0.1.3 +jedi==0.19.2 +click==8.3.1 +datasets==4.5.0 +soxr==1.0.0 +sgl-kernel==0.3.9.post2 +colorful==0.5.8 +pyasn1==0.6.2 +charset-normalizer==3.4.4 +nvidia-ml-py==13.590.48 +hf-xet==1.2.0 +dill==0.4.0 +absl-py==2.4.0 +pydantic==2.12.5 +dspy==3.1.3 +wrapt==2.1.0 +flashinfer-python==0.3.1 +python-dateutil==2.9.0.post0 +torchao==0.9.0 +cachetools==7.0.0 +soundfile==0.13.1 +diskcache==5.6.3 +onnx-ir==0.1.15 +docstring_parser==0.17.0 +matplotlib-inline==0.2.1 +Pygments==2.19.2 +wandb==0.24.1 +pure_eval==0.2.3 +ninja==1.13.0 +proto-plus==1.27.0 +pyzmq==27.1.0 
+iniconfig==2.3.0 +Jinja2==3.1.6 +megatron-core==0.13.1 +uvloop==0.22.1 +fastuuid==0.14.0 +pycparser==3.0 +pylatexenc==2.10 +decorator==5.2.1 +shellingham==1.5.4 +lxml==6.0.2 +safetensors==0.7.0 +xgrammar==0.1.25 +pybase64==1.4.3 +ipython==9.9.0 +greenlet==3.3.1 +mistral_common==1.9.0 +rignore==0.7.6 +einops==0.8.2 +distlib==0.4.0 +triton==3.4.0 +executing==2.2.1 +grpcio==1.76.0 +pip==25.3 +verl==0.8.0.dev0 +verl==0.8.0.dev0 diff --git a/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/files/wandb-metadata.json b/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..b393367930f475c7911a90aa10198388f8bf25ce --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/files/wandb-metadata.json @@ -0,0 +1,93 @@ +{ + "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.12", + "startedAt": "2026-02-10T05:05:42.731761Z", + "args": [ + "--node-ip-address=172.16.34.29", + "--node-manager-port=39329", + "--object-store-name=/tmp/ray/session_2026-02-09_23-59-33_106640_3780904/sockets/plasma_store", + "--raylet-name=/tmp/ray/session_2026-02-09_23-59-33_106640_3780904/sockets/raylet", + "--redis-address=None", + "--metrics-agent-port=61322", + "--logging-rotate-bytes=536870912", + "--logging-rotate-backup-count=5", + "--runtime-env-agent-port=64277", + "--gcs-address=172.16.34.29:61670", + "--session-name=session_2026-02-09_23-59-33_106640_3780904", + "--temp-dir=/tmp/ray", + "--webui=127.0.0.1:8297", + "--cluster-id=f3d4f943538bba80589df6297cde07197630716a63e13a8e2b80f5d7", + "--startup-token=128", + "--worker-launch-time-ms=1770699585639", + "--node-id=df798a3a0fabc60d5df71d3c6e276eebc0a0a39646cff7b49cfc7423", + "--runtime-env-hash=1096984665" + ], + "program": "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py", + "git": { + "remote": 
"https://github.com/verl-project/verl", + "commit": "d9939add7a2a01923a9088891f913a5d20c4e622" + }, + "email": "shahidulshakib034@gmail.com", + "root": "/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train", + "host": "gamma", + "executable": "/home/mshahidul/miniconda3/envs/verl2/bin/python3", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA A100 80GB PCIe", + "gpu_count": 6, + "disk": { + "/": { + "total": "3766429188096", + "used": "191242661888" + } + }, + "memory": { + "total": "1081814863872" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece" + } + ], + "cudaVersion": "13.0", + "writerId": "gc5oti9ohsskplcsqtnkl5dfkfh06kju" +} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/files/wandb-summary.json b/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/files/wandb-summary.json new file mode 100644 index 
0000000000000000000000000000000000000000..0fa02652fd51f44042f2441e36c10a2dbe3b1411 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":577,"_wandb":{"runtime":577}} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/logs/debug-core.log b/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..d5b885a47eb034b0592ec229bf55244a06178349 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/logs/debug-core.log @@ -0,0 +1,8 @@ +{"time":"2026-02-10T00:05:42.88556399-05:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp_998e929/port-3789424.txt","pid":3789424,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-10T00:05:42.886575339-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":3789424} +{"time":"2026-02-10T00:05:42.886570624-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-3789424-3799227-3397557547/socket","Net":"unix"}} +{"time":"2026-02-10T00:05:43.043301577-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-10T00:05:43.055063051-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"03xcpt7l","id":"1(@)"} +{"time":"2026-02-10T00:05:44.675181561-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"03xcpt7l","id":"1(@)"} +{"time":"2026-02-10T00:05:50.862144117-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"x7xtdc0sam17"} +{"time":"2026-02-10T00:15:22.684860046-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"x7xtdc0sam17"} diff --git 
a/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/logs/debug-internal.log b/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..87c54a2e32518f3ea0aa5a7bf5155a9f3dd66911 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2026-02-10T00:05:43.05661957-05:00","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-10T00:05:44.671346502-05:00","level":"INFO","msg":"stream: created new stream","id":"03xcpt7l"} +{"time":"2026-02-10T00:05:44.672455175-05:00","level":"INFO","msg":"handler: started","stream_id":"03xcpt7l"} +{"time":"2026-02-10T00:05:44.675146416-05:00","level":"INFO","msg":"stream: started","id":"03xcpt7l"} +{"time":"2026-02-10T00:05:44.675188611-05:00","level":"INFO","msg":"sender: started","stream_id":"03xcpt7l"} +{"time":"2026-02-10T00:05:44.675192997-05:00","level":"INFO","msg":"writer: started","stream_id":"03xcpt7l"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..078e4aa62d03ca59be227a0a321dc6a1e4b37eb2 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/logs/debug.log @@ -0,0 +1,27 @@ +2026-02-10 00:05:42,748 INFO MainThread:3789424 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-10 00:05:42,749 INFO MainThread:3789424 [wandb_setup.py:_flush():81] Configure stats pid to 3789424 +2026-02-10 00:05:42,749 INFO MainThread:3789424 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-10 00:05:42,749 INFO MainThread:3789424 [wandb_init.py:setup_run_log_directory():717] Logging user logs to 
/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/logs/debug.log +2026-02-10 00:05:42,749 INFO MainThread:3789424 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260210_000542-03xcpt7l/logs/debug-internal.log +2026-02-10 00:05:42,749 INFO MainThread:3789424 [wandb_init.py:init():844] calling init triggers +2026-02-10 00:05:42,751 INFO MainThread:3789424 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 32, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 
'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 
'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 1024, 'response_length': 2048, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.6, 'ignore_eos': False, 'enforce_eager': False, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 
'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': False, 'agent': {'_target_': 'verl.workers.config.AgentLoopConfig', 'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 
'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': False}, 'model': {'_target_': 'verl.workers.config.HFModelConfig', 'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 
'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet', 'train_max_samples': -1, 'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 1024, 'max_response_length': 2048, 'train_batch_size': 512, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': None}, 'datagen': {'path': None, 'name': None}, 'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 
'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 
'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 'reward_loop_class_name': None, 'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 
'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 'target_kl': 0.1}, 'use_pf_ppo': False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-en', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 5, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 10, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': '/home/mshahidul/readctrl/code/RL_model/train_v2', 'max_actor_ckpt_to_keep': 1, 
'max_critic_ckpt_to_keep': 1, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto', 'remove_previous_ckpt_in_save': True}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, '_wandb': {}} +2026-02-10 00:05:42,751 INFO MainThread:3789424 [wandb_init.py:init():892] starting backend +2026-02-10 00:05:43,043 INFO MainThread:3789424 [wandb_init.py:init():895] sending inform_init request +2026-02-10 00:05:43,049 INFO MainThread:3789424 [wandb_init.py:init():903] backend started and connected +2026-02-10 00:05:43,057 INFO MainThread:3789424 [wandb_init.py:init():973] updated telemetry +2026-02-10 00:05:43,076 INFO MainThread:3789424 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-10 00:05:45,011 INFO MainThread:3789424 [wandb_init.py:init():1042] starting run threads in backend +2026-02-10 00:05:45,832 INFO MainThread:3789424 [wandb_run.py:_console_start():2529] atexit reg +2026-02-10 00:05:45,832 INFO MainThread:3789424 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-10 00:05:45,832 INFO MainThread:3789424 [wandb_run.py:_redirect():2446] Wrapping output streams. 
+2026-02-10 00:05:45,833 INFO MainThread:3789424 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-10 00:05:45,846 INFO MainThread:3789424 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-10 00:15:22,681 INFO MainThread:3789424 [wandb_run.py:_finish():2295] finishing run shahidulshakib034-khulna-university-of-engineering-techn/readctrl-verl/03xcpt7l +2026-02-10 00:15:22,683 INFO MainThread:3789424 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0 +2026-02-10 00:15:22,684 INFO MainThread:3789424 [wandb_run.py:_restore():2476] restore +2026-02-10 00:15:22,684 INFO MainThread:3789424 [wandb_run.py:_restore():2482] restore done +2026-02-10 00:15:22,868 INFO wandb-AsyncioManager-main:3789424 [service_client.py:_forward_responses():94] Reached EOF. +2026-02-10 00:15:22,869 INFO wandb-AsyncioManager-main:3789424 [mailbox.py:close():154] Closing mailbox, abandoning 2 handles. +2026-02-10 00:15:22,870 INFO MainThread:3789424 [wandb_run.py:_restore():2476] restore +2026-02-10 00:15:22,870 INFO MainThread:3789424 [wandb_run.py:_restore():2482] restore done diff --git a/code/RL_model/verl/verl_train/wandb/run-20260210_002512-y8zrft04/files/config.yaml b/code/RL_model/verl/verl_train/wandb/run-20260210_002512-y8zrft04/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..91d98a525ec841645630e1c56c8129ef38be557d --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260210_002512-y8zrft04/files/config.yaml @@ -0,0 +1,819 @@ +_wandb: + value: + cli_version: 0.24.1 + e: + b67cw9b3932w1ydhiiskcdzjmplv9v6z: + args: + - --node-ip-address=172.16.34.29 + - --node-manager-port=41799 + - --object-store-name=/tmp/ray/session_2026-02-10_00-19-11_291622_3842721/sockets/plasma_store + - --raylet-name=/tmp/ray/session_2026-02-10_00-19-11_291622_3842721/sockets/raylet + - --redis-address=None + - --metrics-agent-port=51842 + - --logging-rotate-bytes=536870912 + - --logging-rotate-backup-count=5 + - 
--runtime-env-agent-port=52235 + - --gcs-address=172.16.34.29:58975 + - --session-name=session_2026-02-10_00-19-11_291622_3842721 + - --temp-dir=/tmp/ray + - --webui=127.0.0.1:8297 + - --cluster-id=dc298598f37a0d21538dce20236cc3100f2fd8ec5710337dfdeabc00 + - --startup-token=128 + - --worker-launch-time-ms=1770700763686 + - --node-id=057bfcf3bc01161d491e5fe6ef31ed0729d687cd30c5819b159d5cf2 + - --runtime-env-hash=1096984665 + cpu_count: 64 + cpu_count_logical: 128 + cudaVersion: "13.0" + disk: + /: + total: "3766429188096" + used: "191265169408" + email: shahidulshakib034@gmail.com + executable: /home/mshahidul/miniconda3/envs/verl2/bin/python3 + git: + commit: d9939add7a2a01923a9088891f913a5d20c4e622 + remote: https://github.com/verl-project/verl + gpu: NVIDIA A100 80GB PCIe + gpu_count: 6 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-df506764-0db5-91b4-8ec9-154a3bb8123f + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328 + - architecture: Hopper + cudaCores: 14592 + memoryTotal: "85520809984" + name: NVIDIA H100 PCIe + uuid: GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb + - architecture: Hopper + cudaCores: 14592 + memoryTotal: "85520809984" + name: NVIDIA H100 PCIe + uuid: GPU-d42b6057-13e8-1e88-6aa1-9307df72dece + host: gamma + memory: + total: "1081814863872" + os: Linux-5.15.0-160-generic-x86_64-with-glibc2.35 + program: /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py + python: CPython 3.12.12 + root: 
/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + startedAt: "2026-02-10T05:25:12.620990Z" + writerId: b67cw9b3932w1ydhiiskcdzjmplv9v6z + m: [] + python_version: 3.12.12 + t: + "1": + - 1 + - 11 + - 30 + - 35 + - 41 + - 49 + - 50 + - 51 + - 71 + - 95 + - 98 + - 105 + - 107 + "2": + - 1 + - 11 + - 30 + - 35 + - 41 + - 49 + - 50 + - 51 + - 71 + - 95 + - 98 + - 105 + - 107 + "3": + - 2 + - 13 + - 16 + - 61 + "4": 3.12.12 + "5": 0.24.1 + "6": 4.56.1 + "12": 0.24.1 + "13": linux-x86_64 +actor_rollout_ref: + value: + actor: + _target_: verl.workers.config.FSDPActorConfig + calculate_entropy: false + calculate_sum_pi_squared: false + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + async_save: false + load_contents: + - model + - optimizer + - extra + save_contents: + - model + - optimizer + - extra + clip_ratio: 0.2 + clip_ratio_c: 3 + clip_ratio_high: 0.2 + clip_ratio_low: 0.2 + data_loader_seed: 42 + entropy_checkpointing: false + entropy_coeff: 0 + entropy_from_logits_with_chunking: false + freeze_vision_tower: false + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: false + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + grad_clip: 1 + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + loss_agg_mode: token-mean + loss_scale_factor: null + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + betas: + - 0.9 + - 0.999 + clip_grad: 1 + lr: 1e-06 + lr_scheduler_type: constant + lr_warmup_steps: -1 + lr_warmup_steps_ratio: 0 + min_lr_ratio: 0 + num_cycles: 0.5 + optimizer: AdamW + optimizer_impl: torch.optim + 
override_optimizer_config: null + total_training_steps: 90 + warmup_style: null + weight_decay: 0.01 + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + clip_cov_lb: 1 + clip_cov_ratio: 0.0002 + clip_cov_ub: 5 + kl_cov_ratio: 0.0002 + loss_mode: vanilla + ppo_kl_coef: 0.1 + ppo_epochs: 1 + ppo_max_token_len_per_gpu: 16384 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + ppo_mini_batch_size: 256 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + shuffle: false + strategy: fsdp + sum_pi_squared_checkpointing: false + tau_neg: 1.05 + tau_pos: 1 + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false + use_fused_kernels: false + use_kl_loss: true + use_prefix_grouper: false + use_remove_padding: true + use_torch_compile: true + hybrid_engine: true + model: + _target_: verl.workers.config.HFModelConfig + custom_chat_template: null + enable_activation_offload: false + enable_gradient_checkpointing: true + exclude_modules: null + external_lib: null + fused_kernel_options: + impl_backend: torch + hf_config_path: null + lora_adapter_path: null + lora_alpha: 16 + lora_rank: 0 + mtp: + _target_: verl.workers.config.MtpConfig + detach_encoder: false + enable: false + enable_rollout: false + enable_train: false + method: mtp + mtp_loss_scaling_factor: 0.1 + 
num_speculative_tokens: 1 + speculative_algorithm: EAGLE + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + speculative_num_steps: 3 + path: Qwen/Qwen3-4B-Instruct-2507 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + tokenizer_path: null + trust_remote_code: false + use_fused_kernels: false + use_liger: false + use_remove_padding: true + use_shm: false + nccl_timeout: 600 + ref: + _target_: verl.workers.config.FSDPActorConfig + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: true + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + log_prob_max_token_len_per_gpu: 16384 + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + strategy: fsdp + 
ulysses_sequence_parallel_size: 1 + use_torch_compile: true + rollout: + _target_: verl.workers.config.RolloutConfig + agent: + _target_: verl.workers.config.AgentLoopConfig + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + name: null + path: null + default_agent_loop: single_turn_agent + num_workers: 8 + calculate_log_probs: false + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + cudagraph_capture_sizes: null + data_parallel_size: 1 + disable_log_stats: true + do_sample: true + dtype: bfloat16 + enable_chunked_prefill: true + enable_prefix_caching: true + enable_rollout_routing_replay: false + enforce_eager: false + expert_parallel_size: 1 + free_cache_engine: true + gpu_memory_utilization: 0.6 + ignore_eos: false + layered_summon: false + load_format: dummy + log_prob_max_token_len_per_gpu: 16384 + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: false + logprobs_mode: processed_logprobs + max_model_len: 8192 + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + mode: async + mtp: + _target_: verl.workers.config.MtpConfig + detach_encoder: false + enable: false + enable_rollout: false + enable_train: false + method: mtp + mtp_loss_scaling_factor: 0.1 + num_speculative_tokens: 1 + speculative_algorithm: EAGLE + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + speculative_num_steps: 3 + multi_stage_wake_up: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + format: hermes + interaction_config_path: null + max_assistant_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + max_user_turns: null + num_repeat_rollouts: null + tokenization_sanity_check_mode: strict + tool_config_path: null + tool_response_truncate_side: middle + use_inference_chat_template: false + "n": 3 + name: vllm + over_sample_rate: 0 + 
pipeline_model_parallel_size: 1 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + port: 9090 + served_model_name: Qwen/Qwen3-4B-Instruct-2507 + prompt_length: 1024 + quantization: null + quantization_config_file: null + response_length: 2048 + scheduling_policy: fcfs + skip_dump_dir: /tmp/rollout_dump + skip_rollout: false + skip_tokenizer_init: true + temperature: 1 + tensor_model_parallel_size: 1 + top_k: -1 + top_p: 1 + trace: + _target_: verl.workers.config.TraceConfig + backend: null + max_samples_per_step_per_worker: null + token2text: false + val_kwargs: + _target_: verl.workers.config.SamplingConfig + do_sample: false + "n": 1 + temperature: 0 + top_k: -1 + top_p: 1 +algorithm: + value: + _target_: verl.trainer.config.AlgoConfig + adv_estimator: grpo + gamma: 1 + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + horizon: 10000 + kl_coef: 0.001 + target_kl: 0.1 + type: fixed + kl_penalty: kl + lam: 1 + norm_adv_by_std_in_grpo: true + pf_ppo: + reweight_method: pow + weight_pow: 2 + rollout_correction: + bypass_mode: false + loss_type: ppo_clip + rollout_is: null + rollout_is_batch_normalize: false + rollout_is_threshold: 2 + rollout_rs: null + rollout_rs_threshold: null + use_kl_in_reward: false + use_pf_ppo: false +critic: + value: + _target_: 
verl.workers.config.FSDPCriticConfig + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + async_save: false + load_contents: + - model + - optimizer + - extra + save_contents: + - model + - optimizer + - extra + cliprange_value: 0.5 + data_loader_seed: 42 + enable: null + forward_max_token_len_per_gpu: 32768 + forward_micro_batch_size: null + forward_micro_batch_size_per_gpu: null + grad_clip: 1 + loss_agg_mode: token-mean + model: + _target_: verl.workers.config.FSDPCriticModelCfg + enable_activation_offload: false + enable_gradient_checkpointing: true + external_lib: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: false + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + lora_alpha: 16 + lora_rank: 0 + path: ~/models/deepseek-llm-7b-chat + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + tokenizer_path: Qwen/Qwen3-4B-Instruct-2507 + trust_remote_code: false + use_remove_padding: false + use_shm: false + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + betas: + - 0.9 + - 0.999 + clip_grad: 1 + lr: 1e-05 + lr_scheduler_type: constant + lr_warmup_steps: -1 + lr_warmup_steps_ratio: 0 + min_lr_ratio: 0 + num_cycles: 0.5 + optimizer: AdamW + optimizer_impl: torch.optim + override_optimizer_config: null + total_training_steps: 90 + warmup_style: null + weight_decay: 0.01 + ppo_epochs: 1 + ppo_max_token_len_per_gpu: 32768 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: null + ppo_mini_batch_size: 256 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + 
ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + shuffle: false + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false +custom_reward_function: + value: + name: compute_score + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +data: + value: + custom_cls: + name: null + path: null + datagen: + name: null + path: null + dataloader_num_workers: 8 + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + image_key: images + image_patch_size: 14 + max_prompt_length: 1024 + max_response_length: 2048 + prompt_key: prompt + return_full_prompt: false + return_multi_modal_inputs: true + return_raw_chat: true + return_raw_input_ids: false + reward_fn_key: data_source + sampler: + class_name: null + class_path: null + seed: null + shuffle: true + tokenizer: null + tool_config_path: null + train_batch_size: 512 + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + train_max_samples: -1 + truncation: error + trust_remote_code: false + use_shm: false + val_batch_size: null + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + val_max_samples: -1 + validation_shuffle: false + video_key: videos +global_profiler: + value: + _target_: verl.utils.profiler.ProfilerConfig + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + controller_nsight_options: + cuda-graph-trace: graph + cuda-memory-usage: "true" + trace: cuda,nvtx,cublas,ucx + 
discrete: false + worker_nsight_options: + capture-range: cudaProfilerApi + capture-range-end: null + cuda-graph-trace: graph + cuda-memory-usage: "true" + kill: none + trace: cuda,nvtx,cublas,ucx + torch_memory: + context: all + stack_depth: 32 + stacks: all + trace_alloc_max_entries: 100000 + profile_continuous_steps: false + save_path: outputs/profile + steps: null + tool: null +ray_kwargs: + value: + ray_init: + num_cpus: null + timeline_json_file: null +reward_manager: + value: + _target_: verl.trainer.config.config.RewardManagerConfig + module: + _target_: verl.trainer.config.config.ModuleConfig + name: custom_reward_manager + path: null + name: naive + source: register +reward_model: + value: + enable: false + enable_resource_pool: false + forward_max_token_len_per_gpu: 32768 + launch_reward_fn_async: false + max_length: null + micro_batch_size: null + micro_batch_size_per_gpu: null + model: + external_lib: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + forward_prefetch: false + fsdp_size: -1 + param_offload: false + reshard_after_forward: true + wrap_policy: + min_num_params: 0 + input_tokenizer: Qwen/Qwen3-4B-Instruct-2507 + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + trust_remote_code: false + use_fused_kernels: false + use_remove_padding: false + use_shm: false + n_gpus_per_node: 8 + nnodes: 0 + num_workers: 1 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + reward_loop_class_name: null 
+ reward_loop_module_path: null + reward_loop_source: register + reward_manager: naive + rollout: + _target_: verl.workers.config.RolloutConfig + cudagraph_capture_sizes: null + data_parallel_size: 1 + disable_log_stats: true + dtype: bfloat16 + enable_chunked_prefill: true + enable_prefix_caching: true + enforce_eager: true + expert_parallel_size: 1 + free_cache_engine: true + gpu_memory_utilization: 0.5 + limit_images: null + load_format: auto + max_model_len: null + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + name: ??? + prompt_length: 2048 + response_length: 2048 + skip_tokenizer_init: false + tensor_model_parallel_size: 2 + sandbox_fusion: + max_concurrent: 64 + memory_limit_mb: 1024 + url: null + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false + use_reward_loop: true +trainer: + value: + balance_batch: true + critic_warmup: 0 + default_hdfs_dir: null + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + del_local_ckpt_after_load: false + device: cuda + esi_redundant_time: 0 + experiment_name: qwen3-4b-instruct-en + log_val_generations: 0 + logger: + - console + - wandb + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + n_gpus_per_node: 2 + nnodes: 1 + project_name: readctrl-verl + ray_wait_register_center_timeout: 300 + remove_previous_ckpt_in_save: true + resume_from_path: null + resume_mode: auto + rollout_data_dir: null + save_freq: 5 + test_freq: 10 + total_epochs: 15 + total_training_steps: null + use_legacy_worker_impl: auto + val_before_train: true + val_only: false + validation_data_dir: null +transfer_queue: + value: + enable: false diff --git a/code/RL_model/verl/verl_train/wandb/run-20260210_002512-y8zrft04/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260210_002512-y8zrft04/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..a561ec1558cd07387a1606fafd18f7d663606cd6 --- /dev/null +++ 
b/code/RL_model/verl/verl_train/wandb/run-20260210_002512-y8zrft04/files/output.log @@ -0,0 +1,14 @@ +wandb: Detected [dspy, litellm, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /home/mshahidul/readctrl/code/RL_model/train_v2/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} +validation generation end +("Initial validation metrics: {'val-aux/multiclinsum/reward/mean@1': " + "np.float64(0.6505448959575766), 'val-core/multiclinsum/acc/mean@1': " + "np.float64(0.6505449011414762), 'val-aux/num_turns/min': np.int32(2), " + "'val-aux/num_turns/max': np.int32(2), 'val-aux/num_turns/mean': " + 'np.float64(2.0)}') +step:0 - val-aux/multiclinsum/reward/mean@1:np.float64(0.6505448959575766) - val-core/multiclinsum/acc/mean@1:np.float64(0.6505449011414762) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) +Training Progress: 0%| | 0/90 [00:00\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-02-10T20:12:10.494276292-05:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/shahidulshakib034-khulna-university-of-engineering-techn/readctrl-verl/1211jgw0/file_stream\": read tcp 172.16.34.29:44914->35.186.228.49:443: read: connection reset by peer"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260210_131724-1211jgw0/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260210_131724-1211jgw0/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..7d61d5e78ca4e16588c1dddd1cb65d60024ad764 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260210_131724-1211jgw0/logs/debug.log @@ -0,0 +1,23 @@ +2026-02-10 13:17:24,414 INFO MainThread:915293 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-10 13:17:24,414 INFO MainThread:915293 [wandb_setup.py:_flush():81] Configure stats pid to 915293 +2026-02-10 13:17:24,415 INFO MainThread:915293 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-10 13:17:24,415 INFO MainThread:915293 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260210_131724-1211jgw0/logs/debug.log +2026-02-10 13:17:24,415 INFO MainThread:915293 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260210_131724-1211jgw0/logs/debug-internal.log +2026-02-10 13:17:24,415 INFO MainThread:915293 [wandb_init.py:init():844] calling init triggers +2026-02-10 13:17:24,416 INFO MainThread:915293 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 
'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': True, 'optimizer_offload': True, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 16, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 
'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 
'param_offload': True, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 1024, 'response_length': 2048, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.4, 'ignore_eos': False, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 
'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': False, 'agent': {'_target_': 'verl.workers.config.AgentLoopConfig', 'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': 
False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': False}, 'model': {'_target_': 'verl.workers.config.HFModelConfig', 'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet', 'train_max_samples': -1, 'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 1024, 'max_response_length': 2048, 'train_batch_size': 512, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 
'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': None}, 'datagen': {'path': None, 'name': None}, 'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 
'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 
'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 'reward_loop_class_name': None, 'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 
1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 'target_kl': 0.1}, 'use_pf_ppo': False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-en', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 5, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 10, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': '/home/mshahidul/readctrl/code/RL_model/train_v2', 'max_actor_ckpt_to_keep': 1, 'max_critic_ckpt_to_keep': 1, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto', 'remove_previous_ckpt_in_save': True}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': 
{}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, '_wandb': {}} +2026-02-10 13:17:24,416 INFO MainThread:915293 [wandb_init.py:init():892] starting backend +2026-02-10 13:17:24,704 INFO MainThread:915293 [wandb_init.py:init():895] sending inform_init request +2026-02-10 13:17:24,713 INFO MainThread:915293 [wandb_init.py:init():903] backend started and connected +2026-02-10 13:17:24,724 INFO MainThread:915293 [wandb_init.py:init():973] updated telemetry +2026-02-10 13:17:24,746 INFO MainThread:915293 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-10 13:17:25,466 INFO MainThread:915293 [wandb_init.py:init():1042] starting run threads in backend +2026-02-10 13:17:26,155 INFO MainThread:915293 [wandb_run.py:_console_start():2529] atexit reg +2026-02-10 13:17:26,155 INFO MainThread:915293 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-10 13:17:26,155 INFO MainThread:915293 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-10 13:17:26,156 INFO MainThread:915293 [wandb_run.py:_redirect():2469] Redirects installed. 
+2026-02-10 13:17:26,164 INFO MainThread:915293 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-11 17:39:14,948 INFO MainThread:915293 [wandb_run.py:_finish():2295] finishing run shahidulshakib034-khulna-university-of-engineering-techn/readctrl-verl/1211jgw0 +2026-02-11 17:39:14,950 INFO MainThread:915293 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0 +2026-02-11 17:39:14,951 INFO MainThread:915293 [wandb_run.py:_restore():2476] restore +2026-02-11 17:39:14,951 INFO MainThread:915293 [wandb_run.py:_restore():2482] restore done diff --git a/code/RL_model/verl/verl_train/wandb/run-20260211_181504-2bnxrv8i/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260211_181504-2bnxrv8i/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..d6e17514f9e094c5b0f24b4c21d48b02131d1d93 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260211_181504-2bnxrv8i/files/output.log @@ -0,0 +1,14 @@ +wandb: Detected [dspy, litellm, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} +validation generation end +("Initial validation metrics: {'val-aux/multiclinsum/reward/mean@1': " + "np.float64(0.3417882988526358), 'val-core/multiclinsum/acc/mean@1': " + "np.float64(0.34178829897157226), 'val-aux/num_turns/min': np.int32(2), " + "'val-aux/num_turns/max': np.int32(2), 'val-aux/num_turns/mean': " + 'np.float64(2.0)}') +step:0 - val-aux/multiclinsum/reward/mean@1:np.float64(0.3417882988526358) - val-core/multiclinsum/acc/mean@1:np.float64(0.34178829897157226) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) +Training Progress: 0%| | 0/45 [00:00