diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..5b71a077ac31fdb8f8c655c15c593ead6ef93efe
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,101 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+data/old/testing_data/old/multiclinsum_test_en.json filter=lfs diff=lfs merge=lfs -text
+data/old/testing_data/old/multiclinsum_test_pt.json filter=lfs diff=lfs merge=lfs -text
+data/old/testing_data/old/multiclinsum_test_fr.json filter=lfs diff=lfs merge=lfs -text
+assignment_llm_1/data/cifar-10-batches-py/data_batch_4 filter=lfs diff=lfs merge=lfs -text
+assignment_llm_1/data/cifar-10-batches-py/data_batch_1 filter=lfs diff=lfs merge=lfs -text
+assignment_llm_1/data/cifar-10-batches-py/data_batch_2 filter=lfs diff=lfs merge=lfs -text
+assignment_llm_1/data/cifar-10-batches-py/data_batch_5 filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260206_190357-cyijm662/run-cyijm662.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260215_114517-4c5nwk6l/run-4c5nwk6l.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260213_012459-7qz9wu2i/run-7qz9wu2i.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260202_092950-nfoupjps/run-nfoupjps.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260215_022720-l2pbuwit/run-l2pbuwit.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260211_183524-38mthb4f/run-38mthb4f.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260211_181504-2bnxrv8i/run-2bnxrv8i.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260213_024109-70p0ly3w/run-70p0ly3w.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260207_134018-vq0iy4i3/run-vq0iy4i3.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260210_002512-y8zrft04/run-y8zrft04.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260207_122607-4jfbiq6q/run-4jfbiq6q.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260202_095227-bx2ydf22/run-bx2ydf22.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260201_222949-yk5vgzhp/run-yk5vgzhp.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260211_190231-cje0bmdl/run-cje0bmdl.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260202_004649-iczy37hv/run-iczy37hv.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260213_215553-1w3n5xgv/run-1w3n5xgv.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260201_232745-x2j8bpwi/run-x2j8bpwi.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260207_103450-gjiqvndf/run-gjiqvndf.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260215_041259-udcrfv6m/run-udcrfv6m.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260213_213805-359jnobz/run-359jnobz.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260206_205901-0ndh0r3l/run-0ndh0r3l.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260210_104801-4ptnl9ej/run-4ptnl9ej.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260207_113041-bhf8tuxa/run-bhf8tuxa.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260209_134931-1bt9yf1w/run-1bt9yf1w.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260202_011021-xbya534l/run-xbya534l.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/verl_train/wandb/run-20260210_131724-1211jgw0/run-1211jgw0.wandb filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/Search-R1/misc/public/head.png filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/Search-R1/misc/public/llama32-3b.png filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/Search-R1/misc/public/main.png filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/Search-R1/misc/public/single-turn.png filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/Search-R1/misc/public/multi-turn.png filter=lfs diff=lfs merge=lfs -text
+code/RL_model/verl/Search-R1/misc/public/logo.png filter=lfs diff=lfs merge=lfs -text
+code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+code/RL_model/models/converted_model/v1/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+data/extracting_subclaim/extracted_subclaims_multiclinsum_test_en_full.json filter=lfs diff=lfs merge=lfs -text
+data/processed_test_raw_data/multiclinsum_test_pt.json filter=lfs diff=lfs merge=lfs -text
+data/processed_test_raw_data/multiclinsum_test_fr.json filter=lfs diff=lfs merge=lfs -text
+data/processed_test_raw_data/multiclinsum_test_en.json filter=lfs diff=lfs merge=lfs -text
+data/processed_test_raw_data/multiclinsum_test_es.json filter=lfs diff=lfs merge=lfs -text
+data/vector_db/db_v1/en/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+data/vector_db/db_v1/es/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+data/extracting_subclaim/old/extracted_subclaims_full_data_es.json filter=lfs diff=lfs merge=lfs -text
+data/vector_db/db_v1/pt/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+data/old/testing_data/es_testing_data.json filter=lfs diff=lfs merge=lfs -text
+data/old/testing_data/old/multiclinsum_test_es.json filter=lfs diff=lfs merge=lfs -text
+assignment_llm_1/assignment_image/results/misclassified_examples_test.png filter=lfs diff=lfs merge=lfs -text
+assignment_llm_1/assignment_image/results/misclassified_examples_pretrained_vit.png filter=lfs diff=lfs merge=lfs -text
+assignment_llm_1/assignment_image/data/cifar-10-batches-py/data_batch_3 filter=lfs diff=lfs merge=lfs -text
+assignment_llm_1/assignment_image/data/cifar-10-batches-py/test_batch filter=lfs diff=lfs merge=lfs -text
+assignment_llm_1/assignment_image/data/cifar-10-batches-py/data_batch_4 filter=lfs diff=lfs merge=lfs -text
+assignment_llm_1/assignment_image/data/cifar-10-batches-py/data_batch_1 filter=lfs diff=lfs merge=lfs -text
+assignment_llm_1/assignment_image/data/cifar-10-batches-py/data_batch_2 filter=lfs diff=lfs merge=lfs -text
+assignment_llm_1/assignment_image/data/cifar-10-batches-py/data_batch_5 filter=lfs diff=lfs merge=lfs -text
+assignment_llm_1/data/cifar-10-batches-py/data_batch_3 filter=lfs diff=lfs merge=lfs -text
+assignment_llm_1/data/cifar-10-batches-py/test_batch filter=lfs diff=lfs merge=lfs -text
+*.jsonl filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
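Note: every line in this file follows the standard Git LFS attribute form `<pattern> filter=lfs diff=lfs merge=lfs -text`, which is what `git lfs track "<pattern>"` appends. A minimal sketch for listing the LFS-tracked patterns from such a file (the script name and helper are illustrative, not part of this repo):

    # list_lfs_patterns.py -- hypothetical helper, not part of this diff
    from pathlib import Path

    def lfs_patterns(attributes_path: str = ".gitattributes") -> list[str]:
        """Return patterns whose attributes mark them as Git LFS tracked."""
        patterns = []
        for line in Path(attributes_path).read_text().splitlines():
            parts = line.split()
            # An LFS rule carries the filter=lfs attribute after the pattern.
            if len(parts) >= 2 and "filter=lfs" in parts[1:]:
                patterns.append(parts[0])
        return patterns

    if __name__ == "__main__":
        for pattern in lfs_patterns():
            print(pattern)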
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..01d586e8907425e6ccaeee8f186045a8f5dd1afe
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+code/RL_model/models/
+code/fine_tune_sft_dpo/model/
+code/RL_model/verl/verl_train/dataset/
diff --git a/.gradio/certificate.pem b/.gradio/certificate.pem
new file mode 100644
index 0000000000000000000000000000000000000000..b85c8037f6b60976b2546fdbae88312c5246d9a3
--- /dev/null
+++ b/.gradio/certificate.pem
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
diff --git a/code/RL_model/inference_data/old/RL_model_inference_v1.jsonl b/code/RL_model/inference_data/old/RL_model_inference_v1.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..d24003995404c6f8e0505287983e637d1e59305b
--- /dev/null
+++ b/code/RL_model/inference_data/old/RL_model_inference_v1.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a497473f77837734b1dd62cea949e2ddfea515734ed19dca00d977f90c16ab5
+size 835221
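Note: the three lines above (and in the LFS entries that follow) are a Git LFS pointer in the v1 spec format, not the JSONL payload itself; the actual bytes live in LFS storage, addressed by the sha256 oid. A minimal sketch of reading that metadata (the helper name is illustrative):

    # read_lfs_pointer.py -- hypothetical helper for inspecting pointer files
    def parse_lfs_pointer(text: str) -> dict[str, str]:
        """Parse the 'key value' lines of a Git LFS v1 pointer into a dict."""
        fields = dict(line.strip().split(" ", 1) for line in text.strip().splitlines())
        assert fields["version"] == "https://git-lfs.github.com/spec/v1"
        return fields

    pointer = """version https://git-lfs.github.com/spec/v1
    oid sha256:9a497473f77837734b1dd62cea949e2ddfea515734ed19dca00d977f90c16ab5
    size 835221"""
    fields = parse_lfs_pointer(pointer)
    print(fields["oid"], int(fields["size"]))  # sha256:9a49... 835221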
diff --git a/code/RL_model/inference_data/old/inference_20260213_002423.jsonl b/code/RL_model/inference_data/old/inference_20260213_002423.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..ac0b50580da6fb461f75d63fa911d224a4a058db
--- /dev/null
+++ b/code/RL_model/inference_data/old/inference_20260213_002423.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d1d8c37ad6d849a50ab46d493d10ed22bd1a7c3955ecf1f0b7e9fb0a4b408a9
+size 2439
diff --git a/code/RL_model/inference_data/old/vllm_inference_20260213_003845.jsonl b/code/RL_model/inference_data/old/vllm_inference_20260213_003845.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..1420be3102a04eb8244703fef94620319b4c026d
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_20260213_003845.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe88b4a7955841cfb2caf3bcc97ec954bde9f7a71b85b8e9efa30efe6af8da68
+size 804503
diff --git a/code/RL_model/inference_data/old/vllm_inference_20260213_003845.parquet b/code/RL_model/inference_data/old/vllm_inference_20260213_003845.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..cbe602740c6f42f9273c0411c3ed0724d93102d8
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_20260213_003845.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6169f9e6f4fea6b70fc6733d918ac2cb052a64119be20c4af1eed284cb6edbeb
+size 411508
diff --git a/code/RL_model/inference_data/old/vllm_inference_20260213_003845_meta.json b/code/RL_model/inference_data/old/vllm_inference_20260213_003845_meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..2ce422231721b0f5c96728c034b31d4b2ac7d91e
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_20260213_003845_meta.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee8fba13765cfe3b4286ace8d256e73b881b37acbc6f11da5df6be2ca61142da
+size 537
diff --git a/code/RL_model/inference_data/old/vllm_inference_20260213_165923.jsonl b/code/RL_model/inference_data/old/vllm_inference_20260213_165923.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..b56277985041cddb744db51ae891065f1b544398
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_20260213_165923.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35f8552332ac68f09c962a525961198b17364cada04e0c75c6563657c6f62f12
+size 481610
diff --git a/code/RL_model/inference_data/old/vllm_inference_20260213_170937.parquet b/code/RL_model/inference_data/old/vllm_inference_20260213_170937.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..e882d5eef25c90cf3cb3d1254eea6238e158ffc9
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_20260213_170937.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3819782525c740c8bdd4b1071ca8c869f7ebe198438385ee101e50e705a8fae1
+size 462627
diff --git a/code/RL_model/inference_data/old/vllm_inference_20260213_170937_meta.json b/code/RL_model/inference_data/old/vllm_inference_20260213_170937_meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..fa17c3df7cefea25dbad2c09e088128442dbf24b
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_20260213_170937_meta.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dcfa23d1b08bf31f701e04301ce9c3c8132a2b584b98aa0c73b99536c4faccd0
+size 556
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_180009.jsonl b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_180009.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..c658abe6f752d9f4b914323482a874d14194dbca
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_180009.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd493befed37a97951291f727d6f6a9e74f884dc1a17ae15dbaaf0652b56d570
+size 628725
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_180009.parquet b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_180009.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..6273dbea896d3b45ef48d11bd5987a9c48e21435
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_180009.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0b6012ec2e2089bf304656397e37f7e4ecd96206c00876a15d0e488c9c50f48
+size 339413
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_180009_meta.json b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_180009_meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..33ad11402613a3607499e09ab23735f077b04413
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_180009_meta.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8d4f6245af26eb17338a50fbdc677dff02879f2f3a357cbc247e69b796f2041
+size 676
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_182710.jsonl b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_182710.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..98cec0d7e6dc92f7144f55304e32538985f85f8c
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_182710.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4ebbd4f6ff72ea59babac30d7a7624cd3ee83fcc59af7206e950636fdfe1974
+size 631535
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_182710.parquet b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_182710.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..4987732a987c3f14e3f1d8f899ed37c92c7af083
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_182710.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f5223c778f5078592dfb09a0c6f90457e18eae998444aafd04b971ec59b3acb
+size 343607
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_182710_meta.json b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_182710_meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..a9089dea18991e79b3da30e84425a8caf7b1dd97
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_182710_meta.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f13e0a6fed69ba7a0117a87c23d421c2d667c156d4c2d74b43cad338bc5eac0
+size 676
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_190731.jsonl b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_190731.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..1203e9e42c9d24897f5ff1194e7dfde45e691d1c
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_190731.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eec7d75a1480af531ef85712bfc8bda1617ab5b6477485c4c963b2b05aecc5eb
+size 633243
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_190731.parquet b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_190731.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..1db98e6de9702b23350e3a86ef58b56d08f36c64
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_190731.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9674c29843794ef927878cf36aca0bf8bba690d4b5a51fcac91aa6d229361598
+size 342272
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_190731_meta.json b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_190731_meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..1b10f754eef26089785d363ff3dd2d01dc8897e7
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260218_190731_meta.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2267d9c55ae4ae24d8ebdd680bc4eea3416c1a255a50e8a735be6d60d7509d6
+size 677
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_205457.jsonl b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_205457.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..14702de96ad71eecfc7f0bc8151306561fbffa04
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_205457.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1ba1de2615c8d5997597d4ea4de0fb540090992c44c0229fbda7b0f4c3e5d8f
+size 758975
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_205457.parquet b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_205457.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..da7fa28feaa4282de051df81d49ee7acd6b0e6da
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_205457.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa02ab2da3a93f4445ce38e2f722e5a1cf5aface937d38f8d6df1bd8fd3e0807
+size 396783
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_205457_meta.json b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_205457_meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..3e7833c1aecedfc2138352dd5e09ceb797e4a0d0
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_205457_meta.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11074cf505493cbac4986da5c013ec679f5adab8b6e274aaf5fd2dc611f05d34
+size 686
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_205655.jsonl b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_205655.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..c15cc6a46b3c0eb86581e8e88a2301c5955dd8be
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_205655.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6338ae3fd068f10d471c0bd3031904302457192628d387e97ab4b2abbac5240
+size 766571
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_205655.parquet b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_205655.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..526e1d25bd34f2658ec7a6536d388003cfe7e924
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_205655.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c995fb2f2fe266128bcfcc129d14c284f5c1abb95731ecd36fd4fd7509aa1d7f
+size 399192
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_205655_meta.json b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_205655_meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..84adc6fc79dda385a792df71c746bf31e56653f7
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_205655_meta.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08ed5c98e51b907765ff1355781f3d810601ff0e6c3412bcf0e2dc06e9f7f51b
+size 686
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_210049.jsonl b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_210049.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..9e4872db0ae173329d330524cd0631ba12d32968
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_210049.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:150975005f27d044a1de21ae8c59d5c4a410b98ca6951225cd9c5e71018db563
+size 764561
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_210049.parquet b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_210049.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..4e9410a1e936f2ec9b3df88a66a38fffd2eb580a
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_210049.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86126f6743480db04b95885d215e40c9000c3957c3aa63119cbe8c73bf5ef6de
+size 398845
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_210049_meta.json b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_210049_meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..1aa9ca824e4f5a26a0cf32a8bd429593464ffffc
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_210049_meta.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c25290b9ca32838d80037f2e025be430941d015b8be4f55d90f4319f4ed0c2c5
+size 686
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_211032.jsonl b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_211032.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..5a4d18589cfa0f3863621371d5335a87912e7c1a
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_211032.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f2406425f8301f97ba1ff89c8a73e5d947a46058405ab3226e09f6deea7d5ec
+size 762383
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_211032.parquet b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_211032.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..804e8f1d681cf72fb3f38ba00fae50a0bc9f6c3f
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_211032.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06e737f9eb4e6c9c758340ffcf9434c987e9a12c422a8beed79bac4eecc6346d
+size 397744
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_211032_meta.json b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_211032_meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..348ff1f501a63fcb835a049b73b7d9bec8ae415e
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_211032_meta.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:64796bad01077b092fb5b2c64c410c47ad47e8d0509c2cb49c3b445768bb3c90
+size 686
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_211421.jsonl b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_211421.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..ab5ced468196feac6a1a18b090bb45b84e66acea
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_211421.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f0f3aef9770e83272556af1d723bccfe48a3e2973f45498b4512e6e583caa13
+size 760839
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_211421.parquet b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_211421.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..7a3f1980103a3842519c506a74ab683a2c415567
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_211421.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:812d603ae99016ebb17e1703c9126065ac98023ea7e723f88c97a5bb65772471
+size 396776
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_211421_meta.json b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_211421_meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b4bb4835e629fb40012f8d5fadfcb95f24959c3
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_211421_meta.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6634072bcb66007af7d7ba517af3e84ae110518a689e734140f67e8c1bb5870e
+size 686
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212208.jsonl b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212208.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..4b1677bd55fdeefae8ea223877ce703187ea4934
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212208.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9717914165faace2eb91397e281553b0820105bba3e7c2e9a5eee9e8e5d34212
+size 760303
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212208.parquet b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212208.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..e5bd030fc016d5abff66ed73999267f47d73635c
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212208.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e3a717b802c9420b64b4cc4a63bd36bd2743a587505226606dec4fec8a14952
+size 394374
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212208_meta.json b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212208_meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..fe365332a4fca5a1d3af30c85c79a826b1b1bc5b
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212208_meta.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2ff430cb8c6090956611541924a867cd42de59463250707059f90623e22eea5
+size 686
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212257.jsonl b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212257.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..a5cef9dbfea5ab20d6c720c3d1c012337ec6ffb1
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212257.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b093f9d5159dc2d4729f6dce4b9a87d7392abecedece20b3b87f7f350bbb56a8
+size 762491
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212257.parquet b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212257.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..b23db53dd17961ca9c253f338e2918ee766ea26b
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212257.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58957eb824a1b6adabff4e92009bf865bb3d34717f430ab67a5cb5e0250316ed
+size 396480
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212257_meta.json b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212257_meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..72a41b7bcd4510bab9dd123fdf779f50d9f28d77
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212257_meta.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3653b4a17a6546eeeb9970a06e5a816856fa11c4e6e83cfe32d7daae74843e15
+size 686
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212814.jsonl b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212814.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..b9ac46afd188bf3287dab7770f8caa08c54d1dc8
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212814.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb3e854fb4b8385f1c9aaa16dd9adadfc342b1e3c5811753531f156f657be258
+size 765631
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212814.parquet b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212814.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..861ff85e4e249fc87f72c5ff70933d86351085c9
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212814.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d92d76d81da23c0f4672fced98921a12a901fef4ee74f10b8cb8fdae8ad8577
+size 399211
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212814_meta.json b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212814_meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..fcc0258715d19625b73cb50b00f886042cffe301
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_212814_meta.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7f1a488609b2f5755727ced1027a4896d36646c562a17210bceaf185b789775
+size 686
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_213052.jsonl b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_213052.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..9c2193b4f4fa23a22f01e65082505372deeb092d
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_213052.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3b43fe92c366927157ca72bdc9d4ea814a31902b791822a2439bfb96800707a
+size 762195
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_213052.parquet b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_213052.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..d0307cc2d9ae1e01b295087dd7c5827f34ff9bbc
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_213052.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b0b263bf2f86c7a1fdedb6391782ce601309862851bc0343ddb8b1ef57af010
+size 398400
diff --git a/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_213052_meta.json b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_213052_meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..5c330ea25ba0f92e569a1f46b82c8ae90ef61922
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_home-mshahidul-readctrl-code-rl-model-models-converted-model-v1_20260224_213052_meta.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7281cae07e3b544f2f9c7635818fd427e1e7c31fae6661028817ca55aa44dd85
+size 686
diff --git a/code/RL_model/inference_data/old/vllm_inference_qwen-qwen3-4b-instruct-2507_20260213_173334.parquet b/code/RL_model/inference_data/old/vllm_inference_qwen-qwen3-4b-instruct-2507_20260213_173334.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..bcc1dec075e11e119a18305b89bca6ae68881559
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_qwen-qwen3-4b-instruct-2507_20260213_173334.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86c0fa37986d9c493bcd15bec280ec971a5ec43f53aba856687bd8947deab294
+size 435416
diff --git a/code/RL_model/inference_data/old/vllm_inference_qwen-qwen3-4b-instruct-2507_20260213_173334_meta.json b/code/RL_model/inference_data/old/vllm_inference_qwen-qwen3-4b-instruct-2507_20260213_173334_meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..f62a3c3b1760fd8676b430ac7e91145a8ccd3c6e
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_qwen-qwen3-4b-instruct-2507_20260213_173334_meta.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56d0b275c63b5d519faac586a29f7e736215ea405872f60a4e74907bf90c6c65
+size 575
diff --git a/code/RL_model/inference_data/old/vllm_inference_qwen-qwen3-4b-instruct-2507_20260217_154022.jsonl b/code/RL_model/inference_data/old/vllm_inference_qwen-qwen3-4b-instruct-2507_20260217_154022.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..2f0a4b6f70865b59278e64796b9a5617263ada74
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_qwen-qwen3-4b-instruct-2507_20260217_154022.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60bda4dda02be8efcd48045c33b4b94e84637f43b1373069d1a218abb0c4e8e4
+size 695711
diff --git a/code/RL_model/inference_data/old/vllm_inference_qwen-qwen3-4b-instruct-2507_20260217_154022.parquet b/code/RL_model/inference_data/old/vllm_inference_qwen-qwen3-4b-instruct-2507_20260217_154022.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..787b12b5c391f53147daecfc3a46ed00eec7f76d
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_qwen-qwen3-4b-instruct-2507_20260217_154022.parquet
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7248e2f30ef2cb5219de833823c1217f9721d0529b654dc13ebca6ded30ce25f
+size 374336
diff --git a/code/RL_model/inference_data/old/vllm_inference_qwen-qwen3-4b-instruct-2507_20260217_154022_meta.json b/code/RL_model/inference_data/old/vllm_inference_qwen-qwen3-4b-instruct-2507_20260217_154022_meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..35ac8eb6de961440656cf1d173d1e9ed048ddd9e
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_inference_qwen-qwen3-4b-instruct-2507_20260217_154022_meta.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc1d53f71d0a01fab4ff1786ee36c3183d7836272d64809826548893636052b2
+size 567
diff --git a/code/RL_model/inference_data/old/vllm_server_20260218_175841.log b/code/RL_model/inference_data/old/vllm_server_20260218_175841.log
new file mode 100644
index 0000000000000000000000000000000000000000..11e2bc60066dbd38ccecfcab8780aa1a195a8279
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_server_20260218_175841.log
@@ -0,0 +1,137 @@
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-18 17:58:45 [__init__.py:216] Automatically detected platform cuda.
+(APIServer pid=1329524) INFO 02-18 17:58:52 [api_server.py:1839] vLLM API server version 0.11.0
+(APIServer pid=1329524) INFO 02-18 17:58:52 [utils.py:233] non-default args: {'port': 8002, 'model': '/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', 'dtype': 'bfloat16', 'max_model_len': 16384, 'served_model_name': ['inference']}
+(APIServer pid=1329524) INFO 02-18 17:59:04 [model.py:547] Resolved architecture: Qwen3ForCausalLM
+(APIServer pid=1329524) `torch_dtype` is deprecated! Use `dtype` instead!
+(APIServer pid=1329524) INFO 02-18 17:59:04 [model.py:1510] Using max model len 16384
+(APIServer pid=1329524) INFO 02-18 17:59:04 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-18 17:59:09 [__init__.py:216] Automatically detected platform cuda.
+(EngineCore_DP0 pid=1330598) INFO 02-18 17:59:15 [core.py:644] Waiting for init message from front-end.
+(EngineCore_DP0 pid=1330598) INFO 02-18 17:59:15 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', speculative_config=None, tokenizer='/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=inference, enable_prefix_caching=True, chunked_prefill_enabled=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention","vllm.sparse_attn_indexer"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":[2,1],"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"use_inductor_graph_partition":false,"pass_config":{},"max_capture_size":512,"local_cache_dir":null}
+(EngineCore_DP0 pid=1330598) W0218 17:59:16.479000 1330598 site-packages/torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
+(EngineCore_DP0 pid=1330598) W0218 17:59:16.479000 1330598 site-packages/torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+(EngineCore_DP0 pid=1330598) INFO 02-18 17:59:17 [parallel_state.py:1208] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+(EngineCore_DP0 pid=1330598) INFO 02-18 17:59:17 [topk_topp_sampler.py:55] Using FlashInfer for top-p & top-k sampling.
+(EngineCore_DP0 pid=1330598) INFO 02-18 17:59:17 [gpu_model_runner.py:2602] Starting to load model /home/mshahidul/readctrl/code/RL_model/models/converted_model/v1...
+(EngineCore_DP0 pid=1330598) INFO 02-18 17:59:18 [gpu_model_runner.py:2634] Loading model from scratch...
+(EngineCore_DP0 pid=1330598) INFO 02-18 17:59:18 [cuda.py:366] Using Flash Attention backend on V1 engine.
+(EngineCore_DP0 pid=1330598) Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.10it/s]
+(EngineCore_DP0 pid=1330598) INFO 02-18 17:59:20 [default_loader.py:267] Loading weights took 1.94 seconds
+(EngineCore_DP0 pid=1330598) INFO 02-18 17:59:20 [gpu_model_runner.py:2653] Model loading took 7.6065 GiB and 2.110186 seconds
+(EngineCore_DP0 pid=1330598) INFO 02-18 17:59:25 [backends.py:548] Using cache directory: /home/mshahidul/.cache/vllm/torch_compile_cache/5b5cb28ceb/rank_0_0/backbone for vLLM's torch.compile
+(EngineCore_DP0 pid=1330598) INFO 02-18 17:59:25 [backends.py:559] Dynamo bytecode transform time: 4.87 s
+(EngineCore_DP0 pid=1330598) INFO 02-18 17:59:29 [backends.py:197] Cache the graph for dynamic shape for later use
+(EngineCore_DP0 pid=1330598) INFO 02-18 17:59:45 [backends.py:218] Compiling a graph for dynamic shape takes 19.55 s
+(EngineCore_DP0 pid=1330598) INFO 02-18 17:59:54 [monitor.py:34] torch.compile takes 24.41 s in total
+(EngineCore_DP0 pid=1330598) INFO 02-18 17:59:54 [gpu_worker.py:298] Available KV cache memory: 115.97 GiB
+(EngineCore_DP0 pid=1330598) INFO 02-18 17:59:54 [kv_cache_utils.py:1087] GPU KV cache size: 844,448 tokens
+(EngineCore_DP0 pid=1330598) INFO 02-18 17:59:54 [kv_cache_utils.py:1091] Maximum concurrency for 16,384 tokens per request: 51.54x
+(EngineCore_DP0 pid=1330598) 2026-02-18 17:59:54,861 - INFO - autotuner.py:256 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
+(EngineCore_DP0 pid=1330598) 2026-02-18 17:59:54,960 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends
+(EngineCore_DP0 pid=1330598) All deep_gemm operations loaded successfully!
+(EngineCore_DP0 pid=1330598) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:04<00:00, 15.66it/s]
+(EngineCore_DP0 pid=1330598) Capturing CUDA graphs (decode, FULL): 100%|██████████| 67/67 [00:03<00:00, 17.70it/s]
+(EngineCore_DP0 pid=1330598) INFO 02-18 18:00:03 [gpu_model_runner.py:3480] Graph capturing finished in 9 secs, took -0.21 GiB
+(EngineCore_DP0 pid=1330598) INFO 02-18 18:00:03 [core.py:210] init engine (profile, create kv cache, warmup model) took 42.85 seconds
+(APIServer pid=1329524) INFO 02-18 18:00:04 [loggers.py:147] Engine 000: vllm cache_config_info with initialization after num_gpu_blocks is: 52778
+(APIServer pid=1329524) INFO 02-18 18:00:04 [api_server.py:1634] Supported_tasks: ['generate']
+(APIServer pid=1329524) WARNING 02-18 18:00:04 [model.py:1389] Default sampling parameters have been overridden by the model's Hugging Face generation config recommended from the model creator. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`.
+(APIServer pid=1329524) INFO 02-18 18:00:04 [serving_responses.py:137] Using default chat sampling params from model: {'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}
+(APIServer pid=1329524) INFO 02-18 18:00:04 [serving_chat.py:139] Using default chat sampling params from model: {'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}
+(APIServer pid=1329524) INFO 02-18 18:00:04 [serving_completion.py:76] Using default completion sampling params from model: {'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}
+(APIServer pid=1329524) INFO 02-18 18:00:04 [api_server.py:1912] Starting vLLM API server 0 on http://0.0.0.0:8002
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:34] Available routes are:
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /openapi.json, Methods: GET, HEAD
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /docs, Methods: GET, HEAD
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /docs/oauth2-redirect, Methods: GET, HEAD
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /redoc, Methods: GET, HEAD
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /health, Methods: GET
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /load, Methods: GET
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /ping, Methods: POST
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /ping, Methods: GET
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /tokenize, Methods: POST
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /detokenize, Methods: POST
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /v1/models, Methods: GET
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /version, Methods: GET
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /v1/responses, Methods: POST
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /v1/responses/{response_id}, Methods: GET
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /v1/responses/{response_id}/cancel, Methods: POST
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /v1/chat/completions, Methods: POST
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /v1/completions, Methods: POST
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /v1/embeddings, Methods: POST
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /pooling, Methods: POST
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /classify, Methods: POST
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /score, Methods: POST
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /v1/score, Methods: POST
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /v1/audio/transcriptions, Methods: POST
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /v1/audio/translations, Methods: POST
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /rerank, Methods: POST
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /v1/rerank, Methods: POST
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /v2/rerank, Methods: POST
+(APIServer pid=1329524) INFO 02-18 18:00:04 [launcher.py:42] Route: /scale_elastic_ep, Methods: POST
+[1;36m(APIServer pid=1329524)[0;0m INFO 02-18 18:00:04 [launcher.py:42] Route: /is_scaling_elastic_ep, Methods: POST
+[1;36m(APIServer pid=1329524)[0;0m INFO 02-18 18:00:04 [launcher.py:42] Route: /invocations, Methods: POST
+[1;36m(APIServer pid=1329524)[0;0m INFO 02-18 18:00:04 [launcher.py:42] Route: /metrics, Methods: GET
+[1;36m(APIServer pid=1329524)[0;0m INFO: Started server process [1329524]
+[1;36m(APIServer pid=1329524)[0;0m INFO: Waiting for application startup.
+[1;36m(APIServer pid=1329524)[0;0m INFO: Application startup complete.
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:41984 - "GET /v1/models HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:41992 - "GET /v1/models HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:42008 - "GET /v1/models HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:42022 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO 02-18 18:00:14 [loggers.py:127] Engine 000: Avg prompt throughput: 1876.4 tokens/s, Avg generation throughput: 304.4 tokens/s, Running: 8 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.2%, Prefix cache hit rate: 15.4%
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:42038 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:53272 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO 02-18 18:00:24 [loggers.py:127] Engine 000: Avg prompt throughput: 2194.5 tokens/s, Avg generation throughput: 776.3 tokens/s, Running: 5 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.2%, Prefix cache hit rate: 15.9%
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:53274 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:53276 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO 02-18 18:00:34 [loggers.py:127] Engine 000: Avg prompt throughput: 2141.3 tokens/s, Avg generation throughput: 708.0 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.2%, Prefix cache hit rate: 16.2%
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:37290 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:37298 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:44496 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO 02-18 18:00:44 [loggers.py:127] Engine 000: Avg prompt throughput: 3206.3 tokens/s, Avg generation throughput: 590.0 tokens/s, Running: 8 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.4%, Prefix cache hit rate: 16.5%
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:44500 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO 02-18 18:00:54 [loggers.py:127] Engine 000: Avg prompt throughput: 1419.1 tokens/s, Avg generation throughput: 631.1 tokens/s, Running: 2 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.8%, Prefix cache hit rate: 16.0%
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:53644 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:53652 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:60804 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO 02-18 18:01:04 [loggers.py:127] Engine 000: Avg prompt throughput: 3004.6 tokens/s, Avg generation throughput: 689.3 tokens/s, Running: 8 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.2%, Prefix cache hit rate: 16.5%
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:60820 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO 02-18 18:01:14 [loggers.py:127] Engine 000: Avg prompt throughput: 2855.7 tokens/s, Avg generation throughput: 479.4 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.0%, Prefix cache hit rate: 14.7%
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:38914 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:38916 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO 02-18 18:01:24 [loggers.py:127] Engine 000: Avg prompt throughput: 2417.1 tokens/s, Avg generation throughput: 625.0 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.3%, Prefix cache hit rate: 14.8%
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:57066 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:57072 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:60618 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:60622 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:60638 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO 02-18 18:01:34 [loggers.py:127] Engine 000: Avg prompt throughput: 5762.9 tokens/s, Avg generation throughput: 867.5 tokens/s, Running: 8 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.9%, Prefix cache hit rate: 15.0%
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:60642 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:34346 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO 02-18 18:01:44 [loggers.py:127] Engine 000: Avg prompt throughput: 1997.0 tokens/s, Avg generation throughput: 653.5 tokens/s, Running: 3 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.6%, Prefix cache hit rate: 15.2%
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:34350 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:43488 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO 02-18 18:01:54 [loggers.py:127] Engine 000: Avg prompt throughput: 2116.9 tokens/s, Avg generation throughput: 566.9 tokens/s, Running: 3 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.7%, Prefix cache hit rate: 15.4%
+[1;36m(APIServer pid=1329524)[0;0m INFO: 127.0.0.1:43490 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1329524)[0;0m INFO 02-18 18:02:04 [loggers.py:127] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 57.7 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 15.4%
+[1;36m(APIServer pid=1329524)[0;0m INFO 02-18 18:02:14 [loggers.py:127] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 15.4%
+[1;36m(APIServer pid=1329524)[0;0m INFO 02-18 18:21:36 [launcher.py:99] Shutting down FastAPI HTTP server.
+[rank0]:[W218 18:21:36.477821731 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[1;36m(APIServer pid=1329524)[0;0m INFO: Shutting down
+[1;36m(APIServer pid=1329524)[0;0m INFO: Waiting for application shutdown.
+[1;36m(APIServer pid=1329524)[0;0m INFO: Application shutdown complete.
diff --git a/code/RL_model/inference_data/old/vllm_server_20260218_182622.log b/code/RL_model/inference_data/old/vllm_server_20260218_182622.log
new file mode 100644
index 0000000000000000000000000000000000000000..17d6e98a06cee6d5933553017f82fd6cf18beb7c
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_server_20260218_182622.log
@@ -0,0 +1,135 @@
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-18 18:26:26 [__init__.py:216] Automatically detected platform cuda.
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:26:33 [api_server.py:1839] vLLM API server version 0.11.0
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:26:33 [utils.py:233] non-default args: {'port': 8002, 'model': '/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', 'dtype': 'bfloat16', 'max_model_len': 16384, 'served_model_name': ['inference']}
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:26:33 [model.py:547] Resolved architecture: Qwen3ForCausalLM
+[1;36m(APIServer pid=1385784)[0;0m `torch_dtype` is deprecated! Use `dtype` instead!
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:26:33 [model.py:1510] Using max model len 16384
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:26:33 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-18 18:26:37 [__init__.py:216] Automatically detected platform cuda.
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m INFO 02-18 18:26:43 [core.py:644] Waiting for init message from front-end.
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m INFO 02-18 18:26:43 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', speculative_config=None, tokenizer='/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=inference, enable_prefix_caching=True, chunked_prefill_enabled=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention","vllm.sparse_attn_indexer"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":[2,1],"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"use_inductor_graph_partition":false,"pass_config":{},"max_capture_size":512,"local_cache_dir":null}
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m W0218 18:26:44.441000 1386320 site-packages/torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m W0218 18:26:44.441000 1386320 site-packages/torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m INFO 02-18 18:26:45 [parallel_state.py:1208] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m INFO 02-18 18:26:45 [topk_topp_sampler.py:55] Using FlashInfer for top-p & top-k sampling.
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m INFO 02-18 18:26:45 [gpu_model_runner.py:2602] Starting to load model /home/mshahidul/readctrl/code/RL_model/models/converted_model/v1...
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m INFO 02-18 18:26:45 [gpu_model_runner.py:2634] Loading model from scratch...
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m INFO 02-18 18:26:45 [cuda.py:366] Using Flash Attention backend on V1 engine.
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00, 1.12it/s]
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m INFO 02-18 18:26:48 [default_loader.py:267] Loading weights took 1.89 seconds
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m INFO 02-18 18:26:48 [gpu_model_runner.py:2653] Model loading took 7.6065 GiB and 2.642895 seconds
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m INFO 02-18 18:26:53 [backends.py:548] Using cache directory: /home/mshahidul/.cache/vllm/torch_compile_cache/5b5cb28ceb/rank_0_0/backbone for vLLM's torch.compile
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m INFO 02-18 18:26:53 [backends.py:559] Dynamo bytecode transform time: 4.82 s
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m INFO 02-18 18:26:55 [backends.py:164] Directly load the compiled graph(s) for dynamic shape from the cache, took 1.551 s
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m INFO 02-18 18:26:57 [monitor.py:34] torch.compile takes 4.82 s in total
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m INFO 02-18 18:26:57 [gpu_worker.py:298] Available KV cache memory: 115.97 GiB
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m INFO 02-18 18:26:57 [kv_cache_utils.py:1087] GPU KV cache size: 844,464 tokens
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m INFO 02-18 18:26:57 [kv_cache_utils.py:1091] Maximum concurrency for 16,384 tokens per request: 51.54x
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m 2026-02-18 18:26:57,908 - INFO - autotuner.py:256 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m 2026-02-18 18:26:58,008 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m All deep_gemm operations loaded successfully!
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:04<00:00, 16.43it/s]
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m
Capturing CUDA graphs (decode, FULL): 100%|██████████| 67/67 [00:03<00:00, 21.88it/s]
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m INFO 02-18 18:27:05 [gpu_model_runner.py:3480] Graph capturing finished in 8 secs, took -0.21 GiB
+[1;36m(EngineCore_DP0 pid=1386320)[0;0m INFO 02-18 18:27:05 [core.py:210] init engine (profile, create kv cache, warmup model) took 16.67 seconds
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [loggers.py:147] Engine 000: vllm cache_config_info with initialization after num_gpu_blocks is: 52779
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [api_server.py:1634] Supported_tasks: ['generate']
+[1;36m(APIServer pid=1385784)[0;0m WARNING 02-18 18:27:06 [model.py:1389] Default sampling parameters have been overridden by the model's Hugging Face generation config recommended from the model creator. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`.
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [serving_responses.py:137] Using default chat sampling params from model: {'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [serving_chat.py:139] Using default chat sampling params from model: {'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [serving_completion.py:76] Using default completion sampling params from model: {'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [api_server.py:1912] Starting vLLM API server 0 on http://0.0.0.0:8002
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:34] Available routes are:
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /openapi.json, Methods: HEAD, GET
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /docs, Methods: HEAD, GET
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /docs/oauth2-redirect, Methods: HEAD, GET
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /redoc, Methods: HEAD, GET
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /health, Methods: GET
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /load, Methods: GET
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /ping, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /ping, Methods: GET
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /tokenize, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /detokenize, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /v1/models, Methods: GET
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /version, Methods: GET
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /v1/responses, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /v1/responses/{response_id}, Methods: GET
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /v1/responses/{response_id}/cancel, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /v1/chat/completions, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /v1/completions, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /v1/embeddings, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /pooling, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /classify, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /score, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /v1/score, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /v1/audio/transcriptions, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /v1/audio/translations, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /rerank, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /v1/rerank, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /v2/rerank, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /scale_elastic_ep, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /is_scaling_elastic_ep, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /invocations, Methods: POST
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:06 [launcher.py:42] Route: /metrics, Methods: GET
+[1;36m(APIServer pid=1385784)[0;0m INFO: Started server process [1385784]
+[1;36m(APIServer pid=1385784)[0;0m INFO: Waiting for application startup.
+[1;36m(APIServer pid=1385784)[0;0m INFO: Application startup complete.
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:37016 - "GET /v1/models HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:37024 - "GET /v1/models HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:37026 - "GET /v1/models HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:37038 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:16 [loggers.py:127] Engine 000: Avg prompt throughput: 1893.0 tokens/s, Avg generation throughput: 418.8 tokens/s, Running: 3 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.6%, Prefix cache hit rate: 15.4%
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:37046 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:51530 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:26 [loggers.py:127] Engine 000: Avg prompt throughput: 2194.5 tokens/s, Avg generation throughput: 751.7 tokens/s, Running: 2 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.6%, Prefix cache hit rate: 15.9%
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:51532 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:51120 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:51136 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:36 [loggers.py:127] Engine 000: Avg prompt throughput: 3003.8 tokens/s, Avg generation throughput: 747.5 tokens/s, Running: 7 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.9%, Prefix cache hit rate: 16.8%
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:51138 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:60442 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:46 [loggers.py:127] Engine 000: Avg prompt throughput: 2344.1 tokens/s, Avg generation throughput: 654.0 tokens/s, Running: 5 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.0%, Prefix cache hit rate: 16.5%
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:60456 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:27:56 [loggers.py:127] Engine 000: Avg prompt throughput: 1419.2 tokens/s, Avg generation throughput: 511.4 tokens/s, Running: 2 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.8%, Prefix cache hit rate: 16.0%
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:50242 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:35680 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:35696 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:28:06 [loggers.py:127] Engine 000: Avg prompt throughput: 3004.8 tokens/s, Avg generation throughput: 728.7 tokens/s, Running: 5 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.9%, Prefix cache hit rate: 16.5%
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:35706 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:45426 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:28:16 [loggers.py:127] Engine 000: Avg prompt throughput: 4100.3 tokens/s, Avg generation throughput: 546.8 tokens/s, Running: 7 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.3%, Prefix cache hit rate: 14.7%
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:45440 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:58716 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:28:26 [loggers.py:127] Engine 000: Avg prompt throughput: 2269.7 tokens/s, Avg generation throughput: 659.4 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.2%, Prefix cache hit rate: 14.9%
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:58718 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:58724 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:42562 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:42566 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:28:36 [loggers.py:127] Engine 000: Avg prompt throughput: 4665.7 tokens/s, Avg generation throughput: 899.0 tokens/s, Running: 2 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.4%, Prefix cache hit rate: 15.0%
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:42578 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:42592 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:59626 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:28:46 [loggers.py:127] Engine 000: Avg prompt throughput: 2940.9 tokens/s, Avg generation throughput: 540.9 tokens/s, Running: 8 reqs, Waiting: 0 reqs, GPU KV cache usage: 1.0%, Prefix cache hit rate: 15.4%
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:59642 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO: 127.0.0.1:59120 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:28:56 [loggers.py:127] Engine 000: Avg prompt throughput: 1173.3 tokens/s, Avg generation throughput: 511.1 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 15.4%
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:29:06 [loggers.py:127] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 15.4%
+[1;36m(APIServer pid=1385784)[0;0m INFO 02-18 18:55:54 [launcher.py:99] Shutting down FastAPI HTTP server.
+[rank0]:[W218 18:55:54.021671499 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[1;36m(APIServer pid=1385784)[0;0m INFO: Shutting down
+[1;36m(APIServer pid=1385784)[0;0m INFO: Waiting for application shutdown.
+[1;36m(APIServer pid=1385784)[0;0m INFO: Application shutdown complete.
diff --git a/code/RL_model/inference_data/old/vllm_server_20260218_190354.log b/code/RL_model/inference_data/old/vllm_server_20260218_190354.log
new file mode 100644
index 0000000000000000000000000000000000000000..c9154a6dfb5ad0245a0ee6f007d762e2c8650d4e
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_server_20260218_190354.log
@@ -0,0 +1,81 @@
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-18 19:03:59 [__init__.py:216] Automatically detected platform cuda.
+[1;36m(APIServer pid=1459651)[0;0m INFO 02-18 19:04:05 [api_server.py:1839] vLLM API server version 0.11.0
+[1;36m(APIServer pid=1459651)[0;0m INFO 02-18 19:04:05 [utils.py:233] non-default args: {'port': 8002, 'model': '/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', 'dtype': 'bfloat16', 'max_model_len': 16384, 'served_model_name': ['inference']}
+[1;36m(APIServer pid=1459651)[0;0m INFO 02-18 19:04:05 [model.py:547] Resolved architecture: Qwen3ForCausalLM
+[1;36m(APIServer pid=1459651)[0;0m `torch_dtype` is deprecated! Use `dtype` instead!
+[1;36m(APIServer pid=1459651)[0;0m INFO 02-18 19:04:05 [model.py:1510] Using max model len 16384
+[1;36m(APIServer pid=1459651)[0;0m INFO 02-18 19:04:05 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-18 19:04:10 [__init__.py:216] Automatically detected platform cuda.
+[1;36m(APIServer pid=1459651)[0;0m Traceback (most recent call last):
+[1;36m(APIServer pid=1459651)[0;0m File "<frozen runpy>", line 198, in _run_module_as_main
+[1;36m(APIServer pid=1459651)[0;0m File "<frozen runpy>", line 88, in _run_code
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1953, in <module>
+[1;36m(APIServer pid=1459651)[0;0m uvloop.run(run_server(args))
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 96, in run
+[1;36m(APIServer pid=1459651)[0;0m return __asyncio.run(
+[1;36m(APIServer pid=1459651)[0;0m ^^^^^^^^^^^^^^
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 195, in run
+[1;36m(APIServer pid=1459651)[0;0m return runner.run(main)
+[1;36m(APIServer pid=1459651)[0;0m ^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 118, in run
+[1;36m(APIServer pid=1459651)[0;0m return self._loop.run_until_complete(task)
+[1;36m(APIServer pid=1459651)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=1459651)[0;0m File "uvloop/loop.pyx", line 1512, in uvloop.loop.Loop.run_until_complete
+[1;36m(APIServer pid=1459651)[0;0m File "uvloop/loop.pyx", line 1505, in uvloop.loop.Loop.run_until_complete
+[1;36m(APIServer pid=1459651)[0;0m File "uvloop/loop.pyx", line 1379, in uvloop.loop.Loop.run_forever
+[1;36m(APIServer pid=1459651)[0;0m File "uvloop/loop.pyx", line 557, in uvloop.loop.Loop._run
+[1;36m(APIServer pid=1459651)[0;0m File "uvloop/loop.pyx", line 476, in uvloop.loop.Loop._on_idle
+[1;36m(APIServer pid=1459651)[0;0m File "uvloop/cbhandles.pyx", line 83, in uvloop.loop.Handle._run
+[1;36m(APIServer pid=1459651)[0;0m File "uvloop/cbhandles.pyx", line 61, in uvloop.loop.Handle._run
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 48, in wrapper
+[1;36m(APIServer pid=1459651)[0;0m return await main
+[1;36m(APIServer pid=1459651)[0;0m ^^^^^^^^^^
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1884, in run_server
+[1;36m(APIServer pid=1459651)[0;0m await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1902, in run_server_worker
+[1;36m(APIServer pid=1459651)[0;0m async with build_async_engine_client(
+[1;36m(APIServer pid=1459651)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=1459651)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=1459651)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 180, in build_async_engine_client
+[1;36m(APIServer pid=1459651)[0;0m async with build_async_engine_client_from_engine_args(
+[1;36m(APIServer pid=1459651)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=1459651)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=1459651)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 225, in build_async_engine_client_from_engine_args
+[1;36m(APIServer pid=1459651)[0;0m async_llm = AsyncLLM.from_vllm_config(
+[1;36m(APIServer pid=1459651)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 1572, in inner
+[1;36m(APIServer pid=1459651)[0;0m return fn(*args, **kwargs)
+[1;36m(APIServer pid=1459651)[0;0m ^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 207, in from_vllm_config
+[1;36m(APIServer pid=1459651)[0;0m return cls(
+[1;36m(APIServer pid=1459651)[0;0m ^^^^
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 134, in __init__
+[1;36m(APIServer pid=1459651)[0;0m self.engine_core = EngineCoreClient.make_async_mp_client(
+[1;36m(APIServer pid=1459651)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 102, in make_async_mp_client
+[1;36m(APIServer pid=1459651)[0;0m return AsyncMPClient(*client_args)
+[1;36m(APIServer pid=1459651)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 769, in __init__
+[1;36m(APIServer pid=1459651)[0;0m super().__init__(
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 495, in __init__
+[1;36m(APIServer pid=1459651)[0;0m if not sync_input_socket.poll(timeout=600_000):
+[1;36m(APIServer pid=1459651)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/zmq/sugar/socket.py", line 1062, in poll
+[1;36m(APIServer pid=1459651)[0;0m evts = dict(p.poll(timeout))
+[1;36m(APIServer pid=1459651)[0;0m ^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/zmq/sugar/poll.py", line 106, in poll
+[1;36m(APIServer pid=1459651)[0;0m return zmq_poll(self.sockets, timeout=timeout)
+[1;36m(APIServer pid=1459651)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=1459651)[0;0m File "zmq/backend/cython/_zmq.py", line 1680, in zmq.backend.cython._zmq.zmq_poll
+[1;36m(APIServer pid=1459651)[0;0m File "zmq/backend/cython/_zmq.py", line 179, in zmq.backend.cython._zmq._check_rc
+[1;36m(APIServer pid=1459651)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1862, in signal_handler
+[1;36m(APIServer pid=1459651)[0;0m raise KeyboardInterrupt("terminated")
+[1;36m(APIServer pid=1459651)[0;0m KeyboardInterrupt: terminated
diff --git a/code/RL_model/inference_data/old/vllm_server_20260218_190643.log b/code/RL_model/inference_data/old/vllm_server_20260218_190643.log
new file mode 100644
index 0000000000000000000000000000000000000000..4503b78902b4667b71732a9f134d409b20020bb1
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_server_20260218_190643.log
@@ -0,0 +1,107 @@
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-18 19:06:47 [__init__.py:216] Automatically detected platform cuda.
+WARNING 02-18 19:06:54 [__init__.py:1742] argument '--disable-log-requests' is deprecated and replaced with '--enable-log-requests'. This will be removed in v0.12.0.
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:06:54 [api_server.py:1839] vLLM API server version 0.11.0
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:06:54 [utils.py:233] non-default args: {'port': 8002, 'model': '/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', 'dtype': 'bfloat16', 'max_model_len': 16384, 'served_model_name': ['inference'], 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': True, 'max_num_seqs': 256}
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:06:54 [model.py:547] Resolved architecture: Qwen3ForCausalLM
+[1;36m(APIServer pid=1464962)[0;0m `torch_dtype` is deprecated! Use `dtype` instead!
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:06:54 [model.py:1510] Using max model len 16384
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:06:54 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-18 19:06:58 [__init__.py:216] Automatically detected platform cuda.
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m INFO 02-18 19:07:05 [core.py:644] Waiting for init message from front-end.
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m INFO 02-18 19:07:05 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', speculative_config=None, tokenizer='/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=inference, enable_prefix_caching=True, chunked_prefill_enabled=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention","vllm.sparse_attn_indexer"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":[2,1],"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"use_inductor_graph_partition":false,"pass_config":{},"max_capture_size":512,"local_cache_dir":null}
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m W0218 19:07:06.066000 1465504 site-packages/torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m W0218 19:07:06.066000 1465504 site-packages/torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m INFO 02-18 19:07:07 [parallel_state.py:1208] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m INFO 02-18 19:07:07 [topk_topp_sampler.py:55] Using FlashInfer for top-p & top-k sampling.
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m INFO 02-18 19:07:07 [gpu_model_runner.py:2602] Starting to load model /home/mshahidul/readctrl/code/RL_model/models/converted_model/v1...
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m INFO 02-18 19:07:07 [gpu_model_runner.py:2634] Loading model from scratch...
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m INFO 02-18 19:07:07 [cuda.py:366] Using Flash Attention backend on V1 engine.
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00, 1.75it/s]
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m INFO 02-18 19:07:09 [default_loader.py:267] Loading weights took 1.20 seconds
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m INFO 02-18 19:07:10 [gpu_model_runner.py:2653] Model loading took 7.6065 GiB and 1.960170 seconds
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m INFO 02-18 19:07:14 [backends.py:548] Using cache directory: /home/mshahidul/.cache/vllm/torch_compile_cache/5b5cb28ceb/rank_0_0/backbone for vLLM's torch.compile
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m INFO 02-18 19:07:14 [backends.py:559] Dynamo bytecode transform time: 4.75 s
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m INFO 02-18 19:07:17 [backends.py:164] Directly load the compiled graph(s) for dynamic shape from the cache, took 1.598 s
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m INFO 02-18 19:07:17 [monitor.py:34] torch.compile takes 4.75 s in total
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m INFO 02-18 19:07:18 [gpu_worker.py:298] Available KV cache memory: 124.41 GiB
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m INFO 02-18 19:07:19 [kv_cache_utils.py:1087] GPU KV cache size: 905,952 tokens
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m INFO 02-18 19:07:19 [kv_cache_utils.py:1091] Maximum concurrency for 16,384 tokens per request: 55.29x
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m 2026-02-18 19:07:19,141 - INFO - autotuner.py:256 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m 2026-02-18 19:07:19,232 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m All deep_gemm operations loaded successfully!
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:04<00:00, 15.46it/s]
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:01<00:00, 22.32it/s]
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m INFO 02-18 19:07:25 [gpu_model_runner.py:3480] Graph capturing finished in 6 secs, took -0.38 GiB
+[1;36m(EngineCore_DP0 pid=1465504)[0;0m INFO 02-18 19:07:25 [core.py:210] init engine (profile, create kv cache, warmup model) took 15.60 seconds
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [loggers.py:147] Engine 000: vllm cache_config_info with initialization after num_gpu_blocks is: 56622
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [api_server.py:1634] Supported_tasks: ['generate']
+[1;36m(APIServer pid=1464962)[0;0m WARNING 02-18 19:07:26 [model.py:1389] Default sampling parameters have been overridden by the model's Hugging Face generation config recommended from the model creator. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`.
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [serving_responses.py:137] Using default chat sampling params from model: {'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [serving_chat.py:139] Using default chat sampling params from model: {'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [serving_completion.py:76] Using default completion sampling params from model: {'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [api_server.py:1912] Starting vLLM API server 0 on http://0.0.0.0:8002
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:34] Available routes are:
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /openapi.json, Methods: GET, HEAD
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /docs, Methods: GET, HEAD
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /docs/oauth2-redirect, Methods: GET, HEAD
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /redoc, Methods: GET, HEAD
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /health, Methods: GET
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /load, Methods: GET
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /ping, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /ping, Methods: GET
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /tokenize, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /detokenize, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /v1/models, Methods: GET
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /version, Methods: GET
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /v1/responses, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /v1/responses/{response_id}, Methods: GET
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /v1/responses/{response_id}/cancel, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /v1/chat/completions, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /v1/completions, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /v1/embeddings, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /pooling, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /classify, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /score, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /v1/score, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /v1/audio/transcriptions, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /v1/audio/translations, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /rerank, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /v1/rerank, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /v2/rerank, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /scale_elastic_ep, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /is_scaling_elastic_ep, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /invocations, Methods: POST
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:26 [launcher.py:42] Route: /metrics, Methods: GET
+[1;36m(APIServer pid=1464962)[0;0m INFO: Started server process [1464962]
+[1;36m(APIServer pid=1464962)[0;0m INFO: Waiting for application startup.
+[1;36m(APIServer pid=1464962)[0;0m INFO: Application startup complete.
+[1;36m(APIServer pid=1464962)[0;0m INFO: 127.0.0.1:45426 - "GET /v1/models HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1464962)[0;0m INFO: 127.0.0.1:45428 - "GET /v1/models HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1464962)[0;0m INFO: 127.0.0.1:45444 - "GET /v1/models HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:36 [loggers.py:127] Engine 000: Avg prompt throughput: 27596.9 tokens/s, Avg generation throughput: 274.9 tokens/s, Running: 196 reqs, Waiting: 4 reqs, GPU KV cache usage: 27.1%, Prefix cache hit rate: 15.3%
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:46 [loggers.py:127] Engine 000: Avg prompt throughput: 568.9 tokens/s, Avg generation throughput: 6575.1 tokens/s, Running: 13 reqs, Waiting: 0 reqs, GPU KV cache usage: 3.6%, Prefix cache hit rate: 15.4%
+[1;36m(APIServer pid=1464962)[0;0m INFO: 127.0.0.1:45472 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1464962)[0;0m INFO: 127.0.0.1:45470 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1464962)[0;0m INFO: 127.0.0.1:45454 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1464962)[0;0m INFO: 127.0.0.1:45464 - "POST /v1/completions HTTP/1.1" 200 OK
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:07:56 [loggers.py:127] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 140.8 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 15.4%
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:08:06 [loggers.py:127] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 15.4%
+[1;36m(APIServer pid=1464962)[0;0m INFO 02-18 19:35:15 [launcher.py:99] Shutting down FastAPI HTTP server.
+[rank0]:[W218 19:35:15.198705064 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[1;36m(APIServer pid=1464962)[0;0m INFO: Shutting down
+[1;36m(APIServer pid=1464962)[0;0m INFO: Waiting for application shutdown.
+[1;36m(APIServer pid=1464962)[0;0m INFO: Application shutdown complete.
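Note on the WARNING at 19:07:26 in the log above: the model's Hugging Face generation config overrode vLLM's default sampling parameters (temperature 0.7, top_k 20, top_p 0.8 in this run), and the log itself names the remedy: relaunch with `--generation-config vllm`. A minimal relaunch sketch in Python, using the same entrypoint the tracebacks below show; the model path is a placeholder, not taken from this log:

    import subprocess

    # Hypothetical relaunch of the OpenAI-compatible server with the flag
    # the WARNING names, so vLLM's own sampling defaults are kept.
    cmd = [
        "python", "-m", "vllm.entrypoints.openai.api_server",
        "--port", "8002",
        "--model", "/path/to/model",      # placeholder, not from this log
        "--generation-config", "vllm",    # keep vLLM defaults, per the WARNING
    ]
    subprocess.run(cmd, check=True)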
diff --git a/code/RL_model/inference_data/old/vllm_server_20260224_204825.log b/code/RL_model/inference_data/old/vllm_server_20260224_204825.log
new file mode 100644
index 0000000000000000000000000000000000000000..f38296ab9cda0940ac6149ecba9c2443c9804dd0
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_server_20260224_204825.log
@@ -0,0 +1,3 @@
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 20:48:30 [__init__.py:216] Automatically detected platform cuda.
diff --git a/code/RL_model/inference_data/old/vllm_server_20260224_205454.log b/code/RL_model/inference_data/old/vllm_server_20260224_205454.log
new file mode 100644
index 0000000000000000000000000000000000000000..04e69ead2102bb60aa6b57f72dbc62ec1542b98c
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_server_20260224_205454.log
@@ -0,0 +1,152 @@
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 20:54:58 [__init__.py:216] Automatically detected platform cuda.
+WARNING 02-24 20:55:05 [__init__.py:1742] argument '--disable-log-requests' is deprecated and replaced with '--enable-log-requests'. This will be removed in v0.12.0.
+[1;36m(APIServer pid=3942112)[0;0m INFO 02-24 20:55:05 [api_server.py:1839] vLLM API server version 0.11.0
+[1;36m(APIServer pid=3942112)[0;0m INFO 02-24 20:55:05 [utils.py:233] non-default args: {'port': 8001, 'model': '/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', 'dtype': 'bfloat16', 'max_model_len': 16384, 'served_model_name': ['inference'], 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': True, 'max_num_seqs': 256}
+[1;36m(APIServer pid=3942112)[0;0m INFO 02-24 20:55:05 [model.py:547] Resolved architecture: Qwen3ForCausalLM
+[1;36m(APIServer pid=3942112)[0;0m `torch_dtype` is deprecated! Use `dtype` instead!
+[1;36m(APIServer pid=3942112)[0;0m INFO 02-24 20:55:05 [model.py:1510] Using max model len 16384
+[1;36m(APIServer pid=3942112)[0;0m INFO 02-24 20:55:05 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 20:55:10 [__init__.py:216] Automatically detected platform cuda.
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m INFO 02-24 20:55:17 [core.py:644] Waiting for init message from front-end.
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m INFO 02-24 20:55:17 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', speculative_config=None, tokenizer='/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=inference, enable_prefix_caching=True, chunked_prefill_enabled=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention","vllm.sparse_attn_indexer"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":[2,1],"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"use_inductor_graph_partition":false,"pass_config":{},"max_capture_size":512,"local_cache_dir":null}
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m W0224 20:55:17.867000 3943049 miniconda3/envs/verl/lib/python3.12/site-packages/torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m W0224 20:55:17.867000 3943049 miniconda3/envs/verl/lib/python3.12/site-packages/torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m INFO 02-24 20:55:18 [parallel_state.py:1208] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] EngineCore failed to start.
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] Traceback (most recent call last):
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] engine_core = EngineCoreProc(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 498, in __init__
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] super().__init__(vllm_config, executor_class, log_stats,
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 83, in __init__
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] self.model_executor = executor_class(vllm_config)
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/executor_base.py", line 54, in __init__
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] self._init_executor()
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 54, in _init_executor
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] self.collective_rpc("init_device")
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 83, in collective_rpc
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] return [run_method(self.driver_worker, method, args, kwargs)]
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 3122, in run_method
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] return func(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 259, in init_device
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] self.worker.init_device() # type: ignore
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/worker/gpu_worker.py", line 187, in init_device
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] raise ValueError(
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ERROR 02-24 20:55:19 [core.py:708] ValueError: Free memory on device (32.15/139.8 GiB) on startup is less than desired GPU memory utilization (0.95, 132.81 GiB). Decrease GPU memory utilization or reduce GPU memory used by other processes.
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m Process EngineCore_DP0:
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m Traceback (most recent call last):
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m self.run()
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/process.py", line 108, in run
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m self._target(*self._args, **self._kwargs)
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 712, in run_engine_core
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m raise e
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m engine_core = EngineCoreProc(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 498, in __init__
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m super().__init__(vllm_config, executor_class, log_stats,
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 83, in __init__
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m self.model_executor = executor_class(vllm_config)
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/executor_base.py", line 54, in __init__
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m self._init_executor()
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 54, in _init_executor
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m self.collective_rpc("init_device")
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 83, in collective_rpc
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m return [run_method(self.driver_worker, method, args, kwargs)]
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 3122, in run_method
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m return func(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 259, in init_device
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m self.worker.init_device() # type: ignore
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/worker/gpu_worker.py", line 187, in init_device
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m raise ValueError(
+[1;36m(EngineCore_DP0 pid=3943049)[0;0m ValueError: Free memory on device (32.15/139.8 GiB) on startup is less than desired GPU memory utilization (0.95, 132.81 GiB). Decrease GPU memory utilization or reduce GPU memory used by other processes.
+[rank0]:[W224 20:55:19.574819090 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[1;36m(APIServer pid=3942112)[0;0m Traceback (most recent call last):
+[1;36m(APIServer pid=3942112)[0;0m File "<frozen runpy>", line 198, in _run_module_as_main
+[1;36m(APIServer pid=3942112)[0;0m File "<frozen runpy>", line 88, in _run_code
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1953, in <module>
+[1;36m(APIServer pid=3942112)[0;0m uvloop.run(run_server(args))
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 96, in run
+[1;36m(APIServer pid=3942112)[0;0m return __asyncio.run(
+[1;36m(APIServer pid=3942112)[0;0m ^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 195, in run
+[1;36m(APIServer pid=3942112)[0;0m return runner.run(main)
+[1;36m(APIServer pid=3942112)[0;0m ^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 118, in run
+[1;36m(APIServer pid=3942112)[0;0m return self._loop.run_until_complete(task)
+[1;36m(APIServer pid=3942112)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3942112)[0;0m File "uvloop/loop.pyx", line 1512, in uvloop.loop.Loop.run_until_complete
+[1;36m(APIServer pid=3942112)[0;0m File "uvloop/loop.pyx", line 1505, in uvloop.loop.Loop.run_until_complete
+[1;36m(APIServer pid=3942112)[0;0m File "uvloop/loop.pyx", line 1379, in uvloop.loop.Loop.run_forever
+[1;36m(APIServer pid=3942112)[0;0m File "uvloop/loop.pyx", line 557, in uvloop.loop.Loop._run
+[1;36m(APIServer pid=3942112)[0;0m File "uvloop/loop.pyx", line 476, in uvloop.loop.Loop._on_idle
+[1;36m(APIServer pid=3942112)[0;0m File "uvloop/cbhandles.pyx", line 83, in uvloop.loop.Handle._run
+[1;36m(APIServer pid=3942112)[0;0m File "uvloop/cbhandles.pyx", line 61, in uvloop.loop.Handle._run
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 48, in wrapper
+[1;36m(APIServer pid=3942112)[0;0m return await main
+[1;36m(APIServer pid=3942112)[0;0m ^^^^^^^^^^
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1884, in run_server
+[1;36m(APIServer pid=3942112)[0;0m await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1902, in run_server_worker
+[1;36m(APIServer pid=3942112)[0;0m async with build_async_engine_client(
+[1;36m(APIServer pid=3942112)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=3942112)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=3942112)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 180, in build_async_engine_client
+[1;36m(APIServer pid=3942112)[0;0m async with build_async_engine_client_from_engine_args(
+[1;36m(APIServer pid=3942112)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=3942112)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=3942112)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 225, in build_async_engine_client_from_engine_args
+[1;36m(APIServer pid=3942112)[0;0m async_llm = AsyncLLM.from_vllm_config(
+[1;36m(APIServer pid=3942112)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 1572, in inner
+[1;36m(APIServer pid=3942112)[0;0m return fn(*args, **kwargs)
+[1;36m(APIServer pid=3942112)[0;0m ^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 207, in from_vllm_config
+[1;36m(APIServer pid=3942112)[0;0m return cls(
+[1;36m(APIServer pid=3942112)[0;0m ^^^^
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 134, in __init__
+[1;36m(APIServer pid=3942112)[0;0m self.engine_core = EngineCoreClient.make_async_mp_client(
+[1;36m(APIServer pid=3942112)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 102, in make_async_mp_client
+[1;36m(APIServer pid=3942112)[0;0m return AsyncMPClient(*client_args)
+[1;36m(APIServer pid=3942112)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 769, in __init__
+[1;36m(APIServer pid=3942112)[0;0m super().__init__(
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 495, in __init__
+[1;36m(APIServer pid=3942112)[0;0m if not sync_input_socket.poll(timeout=600_000):
+[1;36m(APIServer pid=3942112)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/zmq/sugar/socket.py", line 1062, in poll
+[1;36m(APIServer pid=3942112)[0;0m evts = dict(p.poll(timeout))
+[1;36m(APIServer pid=3942112)[0;0m ^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/zmq/sugar/poll.py", line 106, in poll
+[1;36m(APIServer pid=3942112)[0;0m return zmq_poll(self.sockets, timeout=timeout)
+[1;36m(APIServer pid=3942112)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3942112)[0;0m File "zmq/backend/cython/_zmq.py", line 1680, in zmq.backend.cython._zmq.zmq_poll
+[1;36m(APIServer pid=3942112)[0;0m File "zmq/backend/cython/_zmq.py", line 179, in zmq.backend.cython._zmq._check_rc
+[1;36m(APIServer pid=3942112)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1862, in signal_handler
+[1;36m(APIServer pid=3942112)[0;0m raise KeyboardInterrupt("terminated")
+[1;36m(APIServer pid=3942112)[0;0m KeyboardInterrupt: terminated
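The root cause in the log above is plain arithmetic: `gpu_memory_utilization=0.95` asks for 0.95 × 139.8 GiB ≈ 132.81 GiB, but only 32.15 GiB was free at startup. Following the error message's first suggestion, a feasible utilization target can be derived from the memory actually free; a minimal sketch, assuming torch is importable and a roughly 2 GiB headroom (both assumptions, not from the log):

    import torch

    free, total = torch.cuda.mem_get_info()  # bytes free / total on current device
    headroom = 2 * 1024**3                   # assumed slack for other processes
    util = max(0.0, (free - headroom) / total)
    print(f"--gpu-memory-utilization {util:.2f}")
    # For this run: (32.15 - 2) / 139.8 ≈ 0.22, well under the requested 0.95.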
diff --git a/code/RL_model/inference_data/old/vllm_server_20260224_205652.log b/code/RL_model/inference_data/old/vllm_server_20260224_205652.log
new file mode 100644
index 0000000000000000000000000000000000000000..6aa721ce6115aca64061296a9b5c99d0ff35c4c9
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_server_20260224_205652.log
@@ -0,0 +1,132 @@
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 20:56:56 [__init__.py:216] Automatically detected platform cuda.
+WARNING 02-24 20:57:03 [__init__.py:1742] argument '--disable-log-requests' is deprecated and replaced with '--enable-log-requests'. This will be removed in v0.12.0.
+[1;36m(APIServer pid=3948098)[0;0m INFO 02-24 20:57:03 [api_server.py:1839] vLLM API server version 0.11.0
+[1;36m(APIServer pid=3948098)[0;0m INFO 02-24 20:57:03 [utils.py:233] non-default args: {'port': 8001, 'model': '/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', 'dtype': 'bfloat16', 'max_model_len': 16384, 'served_model_name': ['inference'], 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': True, 'max_num_seqs': 256}
+[1;36m(APIServer pid=3948098)[0;0m INFO 02-24 20:57:03 [model.py:547] Resolved architecture: Qwen3ForCausalLM
+[1;36m(APIServer pid=3948098)[0;0m `torch_dtype` is deprecated! Use `dtype` instead!
+[1;36m(APIServer pid=3948098)[0;0m INFO 02-24 20:57:03 [model.py:1510] Using max model len 16384
+[1;36m(APIServer pid=3948098)[0;0m INFO 02-24 20:57:03 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 20:57:08 [__init__.py:216] Automatically detected platform cuda.
+[1;36m(EngineCore_DP0 pid=3948975)[0;0m INFO 02-24 20:57:15 [core.py:644] Waiting for init message from front-end.
+[1;36m(APIServer pid=3948098)[0;0m Traceback (most recent call last):
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 495, in __init__
+[1;36m(APIServer pid=3948098)[0;0m if not sync_input_socket.poll(timeout=600_000):
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/zmq/sugar/socket.py", line 1062, in poll
+[1;36m(APIServer pid=3948098)[0;0m evts = dict(p.poll(timeout))
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/zmq/sugar/poll.py", line 106, in poll
+[1;36m(APIServer pid=3948098)[0;0m return zmq_poll(self.sockets, timeout=timeout)
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "zmq/backend/cython/_zmq.py", line 1680, in zmq.backend.cython._zmq.zmq_poll
+[1;36m(APIServer pid=3948098)[0;0m File "zmq/backend/cython/_zmq.py", line 179, in zmq.backend.cython._zmq._check_rc
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1862, in signal_handler
+[1;36m(APIServer pid=3948098)[0;0m raise KeyboardInterrupt("terminated")
+[1;36m(APIServer pid=3948098)[0;0m KeyboardInterrupt: terminated
+[1;36m(APIServer pid=3948098)[0;0m
+[1;36m(APIServer pid=3948098)[0;0m During handling of the above exception, another exception occurred:
+[1;36m(APIServer pid=3948098)[0;0m
+[1;36m(APIServer pid=3948098)[0;0m Traceback (most recent call last):
+[1;36m(APIServer pid=3948098)[0;0m File "<frozen runpy>", line 198, in _run_module_as_main
+[1;36m(APIServer pid=3948098)[0;0m File "<frozen runpy>", line 88, in _run_code
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1953, in <module>
+[1;36m(APIServer pid=3948098)[0;0m uvloop.run(run_server(args))
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 96, in run
+[1;36m(APIServer pid=3948098)[0;0m return __asyncio.run(
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 195, in run
+[1;36m(APIServer pid=3948098)[0;0m return runner.run(main)
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 118, in run
+[1;36m(APIServer pid=3948098)[0;0m return self._loop.run_until_complete(task)
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "uvloop/loop.pyx", line 1512, in uvloop.loop.Loop.run_until_complete
+[1;36m(APIServer pid=3948098)[0;0m File "uvloop/loop.pyx", line 1505, in uvloop.loop.Loop.run_until_complete
+[1;36m(APIServer pid=3948098)[0;0m File "uvloop/loop.pyx", line 1379, in uvloop.loop.Loop.run_forever
+[1;36m(APIServer pid=3948098)[0;0m File "uvloop/loop.pyx", line 557, in uvloop.loop.Loop._run
+[1;36m(APIServer pid=3948098)[0;0m File "uvloop/loop.pyx", line 476, in uvloop.loop.Loop._on_idle
+[1;36m(APIServer pid=3948098)[0;0m File "uvloop/cbhandles.pyx", line 83, in uvloop.loop.Handle._run
+[1;36m(APIServer pid=3948098)[0;0m File "uvloop/cbhandles.pyx", line 61, in uvloop.loop.Handle._run
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 48, in wrapper
+[1;36m(APIServer pid=3948098)[0;0m return await main
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1884, in run_server
+[1;36m(APIServer pid=3948098)[0;0m await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1902, in run_server_worker
+[1;36m(APIServer pid=3948098)[0;0m async with build_async_engine_client(
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=3948098)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 180, in build_async_engine_client
+[1;36m(APIServer pid=3948098)[0;0m async with build_async_engine_client_from_engine_args(
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=3948098)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 225, in build_async_engine_client_from_engine_args
+[1;36m(APIServer pid=3948098)[0;0m async_llm = AsyncLLM.from_vllm_config(
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 1572, in inner
+[1;36m(APIServer pid=3948098)[0;0m return fn(*args, **kwargs)
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 207, in from_vllm_config
+[1;36m(APIServer pid=3948098)[0;0m return cls(
+[1;36m(APIServer pid=3948098)[0;0m ^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 134, in __init__
+[1;36m(APIServer pid=3948098)[0;0m self.engine_core = EngineCoreClient.make_async_mp_client(
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 102, in make_async_mp_client
+[1;36m(APIServer pid=3948098)[0;0m return AsyncMPClient(*client_args)
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 769, in __init__
+[1;36m(APIServer pid=3948098)[0;0m super().__init__(
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 515, in __init__
+[1;36m(APIServer pid=3948098)[0;0m self._finalizer()
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/weakref.py", line 590, in __call__
+[1;36m(APIServer pid=3948098)[0;0m return info.func(*info.args, **(info.kwargs or {}))
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 344, in __call__
+[1;36m(APIServer pid=3948098)[0;0m self.engine_manager.close()
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 141, in close
+[1;36m(APIServer pid=3948098)[0;0m self._finalizer()
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/weakref.py", line 590, in __call__
+[1;36m(APIServer pid=3948098)[0;0m return info.func(*info.args, **(info.kwargs or {}))
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/utils.py", line 315, in shutdown
+[1;36m(APIServer pid=3948098)[0;0m proc.join(remaining)
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/process.py", line 149, in join
+[1;36m(APIServer pid=3948098)[0;0m res = self._popen.wait(timeout)
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/popen_fork.py", line 40, in wait
+[1;36m(APIServer pid=3948098)[0;0m if not wait([self.sentinel], timeout):
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/connection.py", line 1136, in wait
+[1;36m(APIServer pid=3948098)[0;0m ready = selector.select(timeout)
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/selectors.py", line 415, in select
+[1;36m(APIServer pid=3948098)[0;0m fd_event_list = self._selector.poll(timeout)
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1862, in signal_handler
+[1;36m(APIServer pid=3948098)[0;0m raise KeyboardInterrupt("terminated")
+[1;36m(APIServer pid=3948098)[0;0m KeyboardInterrupt: terminated
+[1;36m(APIServer pid=3948098)[0;0m Exception ignored in atexit callback:
+[1;36m(APIServer pid=3948098)[0;0m Traceback (most recent call last):
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/util.py", line 360, in _exit_function
+[1;36m(APIServer pid=3948098)[0;0m p.join()
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/process.py", line 149, in join
+[1;36m(APIServer pid=3948098)[0;0m res = self._popen.wait(timeout)
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/popen_fork.py", line 43, in wait
+[1;36m(APIServer pid=3948098)[0;0m return self.poll(os.WNOHANG if timeout == 0.0 else 0)
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/popen_fork.py", line 27, in poll
+[1;36m(APIServer pid=3948098)[0;0m pid, sts = os.waitpid(self.pid, flag)
+[1;36m(APIServer pid=3948098)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3948098)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1862, in signal_handler
+[1;36m(APIServer pid=3948098)[0;0m raise KeyboardInterrupt("terminated")
+[1;36m(APIServer pid=3948098)[0;0m KeyboardInterrupt: terminated
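In this run the engine core logged that it was waiting for the front-end, while the API server itself was blocked in `sync_input_socket.poll(timeout=600_000)` — a 600-second ZMQ wait for the engine's init handshake — until the operator's SIGINT surfaced as `KeyboardInterrupt: terminated` via vLLM's signal handler. The underlying wait pattern, sketched with plain pyzmq; the socket type and address are illustrative, not vLLM's actual wiring:

    import zmq

    ctx = zmq.Context.instance()
    sock = ctx.socket(zmq.PULL)                # illustrative socket type
    sock.bind("ipc:///tmp/engine_init_demo")   # illustrative address
    try:
        # poll() takes milliseconds and returns 0 on timeout,
        # matching the 600_000 ms bound seen in the traceback.
        if not sock.poll(timeout=600_000):
            raise TimeoutError("engine core never sent its init message")
        msg = sock.recv()
    finally:
        sock.close()
        ctx.term()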
diff --git a/code/RL_model/inference_data/old/vllm_server_20260224_210046.log b/code/RL_model/inference_data/old/vllm_server_20260224_210046.log
new file mode 100644
index 0000000000000000000000000000000000000000..2642fe8414241798b360660b6e13236bf6376d79
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_server_20260224_210046.log
@@ -0,0 +1,152 @@
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 21:00:50 [__init__.py:216] Automatically detected platform cuda.
+WARNING 02-24 21:00:57 [__init__.py:1742] argument '--disable-log-requests' is deprecated and replaced with '--enable-log-requests'. This will be removed in v0.12.0.
+[1;36m(APIServer pid=3961549)[0;0m INFO 02-24 21:00:57 [api_server.py:1839] vLLM API server version 0.11.0
+[1;36m(APIServer pid=3961549)[0;0m INFO 02-24 21:00:57 [utils.py:233] non-default args: {'port': 8001, 'model': '/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', 'dtype': 'bfloat16', 'max_model_len': 16384, 'served_model_name': ['inference'], 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': True, 'max_num_seqs': 256}
+[1;36m(APIServer pid=3961549)[0;0m INFO 02-24 21:00:57 [model.py:547] Resolved architecture: Qwen3ForCausalLM
+[1;36m(APIServer pid=3961549)[0;0m `torch_dtype` is deprecated! Use `dtype` instead!
+[1;36m(APIServer pid=3961549)[0;0m INFO 02-24 21:00:57 [model.py:1510] Using max model len 16384
+[1;36m(APIServer pid=3961549)[0;0m INFO 02-24 21:00:57 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 21:01:01 [__init__.py:216] Automatically detected platform cuda.
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m INFO 02-24 21:01:07 [core.py:644] Waiting for init message from front-end.
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m INFO 02-24 21:01:07 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', speculative_config=None, tokenizer='/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=inference, enable_prefix_caching=True, chunked_prefill_enabled=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention","vllm.sparse_attn_indexer"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":[2,1],"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"use_inductor_graph_partition":false,"pass_config":{},"max_capture_size":512,"local_cache_dir":null}
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m W0224 21:01:08.007000 3962394 miniconda3/envs/verl/lib/python3.12/site-packages/torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m W0224 21:01:08.007000 3962394 miniconda3/envs/verl/lib/python3.12/site-packages/torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m INFO 02-24 21:01:08 [parallel_state.py:1208] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] EngineCore failed to start.
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] Traceback (most recent call last):
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] engine_core = EngineCoreProc(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 498, in __init__
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] super().__init__(vllm_config, executor_class, log_stats,
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 83, in __init__
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] self.model_executor = executor_class(vllm_config)
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/executor_base.py", line 54, in __init__
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] self._init_executor()
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 54, in _init_executor
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] self.collective_rpc("init_device")
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 83, in collective_rpc
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] return [run_method(self.driver_worker, method, args, kwargs)]
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 3122, in run_method
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] return func(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 259, in init_device
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] self.worker.init_device() # type: ignore
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/worker/gpu_worker.py", line 187, in init_device
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] raise ValueError(
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ERROR 02-24 21:01:08 [core.py:708] ValueError: Free memory on device (72.97/139.8 GiB) on startup is less than desired GPU memory utilization (0.95, 132.81 GiB). Decrease GPU memory utilization or reduce GPU memory used by other processes.
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m Process EngineCore_DP0:
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m Traceback (most recent call last):
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m self.run()
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/process.py", line 108, in run
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m self._target(*self._args, **self._kwargs)
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 712, in run_engine_core
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m raise e
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m engine_core = EngineCoreProc(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 498, in __init__
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m super().__init__(vllm_config, executor_class, log_stats,
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 83, in __init__
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m self.model_executor = executor_class(vllm_config)
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/executor_base.py", line 54, in __init__
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m self._init_executor()
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 54, in _init_executor
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m self.collective_rpc("init_device")
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 83, in collective_rpc
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m return [run_method(self.driver_worker, method, args, kwargs)]
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 3122, in run_method
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m return func(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 259, in init_device
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m self.worker.init_device() # type: ignore
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/worker/gpu_worker.py", line 187, in init_device
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m raise ValueError(
+[1;36m(EngineCore_DP0 pid=3962394)[0;0m ValueError: Free memory on device (72.97/139.8 GiB) on startup is less than desired GPU memory utilization (0.95, 132.81 GiB). Decrease GPU memory utilization or reduce GPU memory used by other processes.
+[rank0]:[W224 21:01:09.210627170 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[1;36m(APIServer pid=3961549)[0;0m Traceback (most recent call last):
+[1;36m(APIServer pid=3961549)[0;0m File "<frozen runpy>", line 198, in _run_module_as_main
+[1;36m(APIServer pid=3961549)[0;0m File "<frozen runpy>", line 88, in _run_code
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1953, in <module>
+[1;36m(APIServer pid=3961549)[0;0m uvloop.run(run_server(args))
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 96, in run
+[1;36m(APIServer pid=3961549)[0;0m return __asyncio.run(
+[1;36m(APIServer pid=3961549)[0;0m ^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 195, in run
+[1;36m(APIServer pid=3961549)[0;0m return runner.run(main)
+[1;36m(APIServer pid=3961549)[0;0m ^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 118, in run
+[1;36m(APIServer pid=3961549)[0;0m return self._loop.run_until_complete(task)
+[1;36m(APIServer pid=3961549)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3961549)[0;0m File "uvloop/loop.pyx", line 1512, in uvloop.loop.Loop.run_until_complete
+[1;36m(APIServer pid=3961549)[0;0m File "uvloop/loop.pyx", line 1505, in uvloop.loop.Loop.run_until_complete
+[1;36m(APIServer pid=3961549)[0;0m File "uvloop/loop.pyx", line 1379, in uvloop.loop.Loop.run_forever
+[1;36m(APIServer pid=3961549)[0;0m File "uvloop/loop.pyx", line 557, in uvloop.loop.Loop._run
+[1;36m(APIServer pid=3961549)[0;0m File "uvloop/loop.pyx", line 476, in uvloop.loop.Loop._on_idle
+[1;36m(APIServer pid=3961549)[0;0m File "uvloop/cbhandles.pyx", line 83, in uvloop.loop.Handle._run
+[1;36m(APIServer pid=3961549)[0;0m File "uvloop/cbhandles.pyx", line 61, in uvloop.loop.Handle._run
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 48, in wrapper
+[1;36m(APIServer pid=3961549)[0;0m return await main
+[1;36m(APIServer pid=3961549)[0;0m ^^^^^^^^^^
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1884, in run_server
+[1;36m(APIServer pid=3961549)[0;0m await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1902, in run_server_worker
+[1;36m(APIServer pid=3961549)[0;0m async with build_async_engine_client(
+[1;36m(APIServer pid=3961549)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=3961549)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=3961549)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 180, in build_async_engine_client
+[1;36m(APIServer pid=3961549)[0;0m async with build_async_engine_client_from_engine_args(
+[1;36m(APIServer pid=3961549)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=3961549)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=3961549)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 225, in build_async_engine_client_from_engine_args
+[1;36m(APIServer pid=3961549)[0;0m async_llm = AsyncLLM.from_vllm_config(
+[1;36m(APIServer pid=3961549)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 1572, in inner
+[1;36m(APIServer pid=3961549)[0;0m return fn(*args, **kwargs)
+[1;36m(APIServer pid=3961549)[0;0m ^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 207, in from_vllm_config
+[1;36m(APIServer pid=3961549)[0;0m return cls(
+[1;36m(APIServer pid=3961549)[0;0m ^^^^
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 134, in __init__
+[1;36m(APIServer pid=3961549)[0;0m self.engine_core = EngineCoreClient.make_async_mp_client(
+[1;36m(APIServer pid=3961549)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 102, in make_async_mp_client
+[1;36m(APIServer pid=3961549)[0;0m return AsyncMPClient(*client_args)
+[1;36m(APIServer pid=3961549)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 769, in __init__
+[1;36m(APIServer pid=3961549)[0;0m super().__init__(
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 495, in __init__
+[1;36m(APIServer pid=3961549)[0;0m if not sync_input_socket.poll(timeout=600_000):
+[1;36m(APIServer pid=3961549)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/zmq/sugar/socket.py", line 1062, in poll
+[1;36m(APIServer pid=3961549)[0;0m evts = dict(p.poll(timeout))
+[1;36m(APIServer pid=3961549)[0;0m ^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/zmq/sugar/poll.py", line 106, in poll
+[1;36m(APIServer pid=3961549)[0;0m return zmq_poll(self.sockets, timeout=timeout)
+[1;36m(APIServer pid=3961549)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3961549)[0;0m File "zmq/backend/cython/_zmq.py", line 1680, in zmq.backend.cython._zmq.zmq_poll
+[1;36m(APIServer pid=3961549)[0;0m File "zmq/backend/cython/_zmq.py", line 179, in zmq.backend.cython._zmq._check_rc
+[1;36m(APIServer pid=3961549)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1862, in signal_handler
+[1;36m(APIServer pid=3961549)[0;0m raise KeyboardInterrupt("terminated")
+[1;36m(APIServer pid=3961549)[0;0m KeyboardInterrupt: terminated
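
Note on the failure above: the ValueError is pure arithmetic. With gpu_memory_utilization=0.95 on a 139.8 GiB device, vLLM reserves 0.95 * 139.8 ~= 132.81 GiB at startup, but only 72.97 GiB was free, so init_device() bails out before any weights load. A minimal pre-flight sketch, assuming nvidia-ml-py (the replacement the FutureWarning at the top of the log recommends over pynvml) and GPU index 0; the 0.02 safety margin is an illustrative choice, not a vLLM requirement:

# Pre-flight check: derive the largest --gpu-memory-utilization that the
# vLLM startup check (free >= utilization * total) would currently accept.
from pynvml import (
    nvmlInit,
    nvmlShutdown,
    nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetMemoryInfo,
)

nvmlInit()
try:
    mem = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(0))
    free_frac = mem.free / mem.total
    safe_util = max(free_frac - 0.02, 0.0)  # illustrative safety margin
    print(f"free: {mem.free / 2**30:.2f} / {mem.total / 2**30:.2f} GiB")
    print(f"largest feasible --gpu-memory-utilization: {safe_util:.2f}")
finally:
    nvmlShutdown()

With the 72.97 GiB free here this comes out near 0.50, at the cost of a proportionally smaller KV cache.
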
diff --git a/code/RL_model/inference_data/old/vllm_server_20260224_211029.log b/code/RL_model/inference_data/old/vllm_server_20260224_211029.log
new file mode 100644
index 0000000000000000000000000000000000000000..21bd1716a5fbffbb6931964237ad6a883c0ff41a
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_server_20260224_211029.log
@@ -0,0 +1,152 @@
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 21:10:33 [__init__.py:216] Automatically detected platform cuda.
+WARNING 02-24 21:10:39 [__init__.py:1742] argument '--disable-log-requests' is deprecated and replaced with '--enable-log-requests'. This will be removed in v0.12.0.
+[1;36m(APIServer pid=3991959)[0;0m INFO 02-24 21:10:39 [api_server.py:1839] vLLM API server version 0.11.0
+[1;36m(APIServer pid=3991959)[0;0m INFO 02-24 21:10:39 [utils.py:233] non-default args: {'port': 8001, 'model': '/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', 'dtype': 'bfloat16', 'max_model_len': 16384, 'served_model_name': ['inference'], 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': True, 'max_num_seqs': 256}
+[1;36m(APIServer pid=3991959)[0;0m INFO 02-24 21:10:39 [model.py:547] Resolved architecture: Qwen3ForCausalLM
+[1;36m(APIServer pid=3991959)[0;0m `torch_dtype` is deprecated! Use `dtype` instead!
+[1;36m(APIServer pid=3991959)[0;0m INFO 02-24 21:10:39 [model.py:1510] Using max model len 16384
+[1;36m(APIServer pid=3991959)[0;0m INFO 02-24 21:10:39 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 21:10:43 [__init__.py:216] Automatically detected platform cuda.
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m INFO 02-24 21:10:50 [core.py:644] Waiting for init message from front-end.
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m INFO 02-24 21:10:50 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', speculative_config=None, tokenizer='/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=inference, enable_prefix_caching=True, chunked_prefill_enabled=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention","vllm.sparse_attn_indexer"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":[2,1],"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"use_inductor_graph_partition":false,"pass_config":{},"max_capture_size":512,"local_cache_dir":null}
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m W0224 21:10:51.066000 3992694 miniconda3/envs/verl/lib/python3.12/site-packages/torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m W0224 21:10:51.066000 3992694 miniconda3/envs/verl/lib/python3.12/site-packages/torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m INFO 02-24 21:10:51 [parallel_state.py:1208] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] EngineCore failed to start.
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] Traceback (most recent call last):
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] engine_core = EngineCoreProc(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 498, in __init__
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] super().__init__(vllm_config, executor_class, log_stats,
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 83, in __init__
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] self.model_executor = executor_class(vllm_config)
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/executor_base.py", line 54, in __init__
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] self._init_executor()
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 54, in _init_executor
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] self.collective_rpc("init_device")
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 83, in collective_rpc
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] return [run_method(self.driver_worker, method, args, kwargs)]
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 3122, in run_method
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] return func(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 259, in init_device
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] self.worker.init_device() # type: ignore
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/worker/gpu_worker.py", line 187, in init_device
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] raise ValueError(
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ERROR 02-24 21:10:51 [core.py:708] ValueError: Free memory on device (73.18/139.8 GiB) on startup is less than desired GPU memory utilization (0.95, 132.81 GiB). Decrease GPU memory utilization or reduce GPU memory used by other processes.
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m Process EngineCore_DP0:
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m Traceback (most recent call last):
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m self.run()
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/process.py", line 108, in run
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m self._target(*self._args, **self._kwargs)
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 712, in run_engine_core
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m raise e
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m engine_core = EngineCoreProc(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 498, in __init__
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m super().__init__(vllm_config, executor_class, log_stats,
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 83, in __init__
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m self.model_executor = executor_class(vllm_config)
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/executor_base.py", line 54, in __init__
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m self._init_executor()
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 54, in _init_executor
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m self.collective_rpc("init_device")
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 83, in collective_rpc
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m return [run_method(self.driver_worker, method, args, kwargs)]
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 3122, in run_method
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m return func(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 259, in init_device
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m self.worker.init_device() # type: ignore
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/worker/gpu_worker.py", line 187, in init_device
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m raise ValueError(
+[1;36m(EngineCore_DP0 pid=3992694)[0;0m ValueError: Free memory on device (73.18/139.8 GiB) on startup is less than desired GPU memory utilization (0.95, 132.81 GiB). Decrease GPU memory utilization or reduce GPU memory used by other processes.
+[rank0]:[W224 21:10:52.339976296 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[1;36m(APIServer pid=3991959)[0;0m Traceback (most recent call last):
+[1;36m(APIServer pid=3991959)[0;0m File "<frozen runpy>", line 198, in _run_module_as_main
+[1;36m(APIServer pid=3991959)[0;0m File "<frozen runpy>", line 88, in _run_code
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1953, in <module>
+[1;36m(APIServer pid=3991959)[0;0m uvloop.run(run_server(args))
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 96, in run
+[1;36m(APIServer pid=3991959)[0;0m return __asyncio.run(
+[1;36m(APIServer pid=3991959)[0;0m ^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 195, in run
+[1;36m(APIServer pid=3991959)[0;0m return runner.run(main)
+[1;36m(APIServer pid=3991959)[0;0m ^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 118, in run
+[1;36m(APIServer pid=3991959)[0;0m return self._loop.run_until_complete(task)
+[1;36m(APIServer pid=3991959)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3991959)[0;0m File "uvloop/loop.pyx", line 1512, in uvloop.loop.Loop.run_until_complete
+[1;36m(APIServer pid=3991959)[0;0m File "uvloop/loop.pyx", line 1505, in uvloop.loop.Loop.run_until_complete
+[1;36m(APIServer pid=3991959)[0;0m File "uvloop/loop.pyx", line 1379, in uvloop.loop.Loop.run_forever
+[1;36m(APIServer pid=3991959)[0;0m File "uvloop/loop.pyx", line 557, in uvloop.loop.Loop._run
+[1;36m(APIServer pid=3991959)[0;0m File "uvloop/loop.pyx", line 476, in uvloop.loop.Loop._on_idle
+[1;36m(APIServer pid=3991959)[0;0m File "uvloop/cbhandles.pyx", line 83, in uvloop.loop.Handle._run
+[1;36m(APIServer pid=3991959)[0;0m File "uvloop/cbhandles.pyx", line 61, in uvloop.loop.Handle._run
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 48, in wrapper
+[1;36m(APIServer pid=3991959)[0;0m return await main
+[1;36m(APIServer pid=3991959)[0;0m ^^^^^^^^^^
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1884, in run_server
+[1;36m(APIServer pid=3991959)[0;0m await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1902, in run_server_worker
+[1;36m(APIServer pid=3991959)[0;0m async with build_async_engine_client(
+[1;36m(APIServer pid=3991959)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=3991959)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=3991959)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 180, in build_async_engine_client
+[1;36m(APIServer pid=3991959)[0;0m async with build_async_engine_client_from_engine_args(
+[1;36m(APIServer pid=3991959)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=3991959)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=3991959)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 225, in build_async_engine_client_from_engine_args
+[1;36m(APIServer pid=3991959)[0;0m async_llm = AsyncLLM.from_vllm_config(
+[1;36m(APIServer pid=3991959)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 1572, in inner
+[1;36m(APIServer pid=3991959)[0;0m return fn(*args, **kwargs)
+[1;36m(APIServer pid=3991959)[0;0m ^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 207, in from_vllm_config
+[1;36m(APIServer pid=3991959)[0;0m return cls(
+[1;36m(APIServer pid=3991959)[0;0m ^^^^
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 134, in __init__
+[1;36m(APIServer pid=3991959)[0;0m self.engine_core = EngineCoreClient.make_async_mp_client(
+[1;36m(APIServer pid=3991959)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 102, in make_async_mp_client
+[1;36m(APIServer pid=3991959)[0;0m return AsyncMPClient(*client_args)
+[1;36m(APIServer pid=3991959)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 769, in __init__
+[1;36m(APIServer pid=3991959)[0;0m super().__init__(
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 495, in __init__
+[1;36m(APIServer pid=3991959)[0;0m if not sync_input_socket.poll(timeout=600_000):
+[1;36m(APIServer pid=3991959)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/zmq/sugar/socket.py", line 1062, in poll
+[1;36m(APIServer pid=3991959)[0;0m evts = dict(p.poll(timeout))
+[1;36m(APIServer pid=3991959)[0;0m ^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/zmq/sugar/poll.py", line 106, in poll
+[1;36m(APIServer pid=3991959)[0;0m return zmq_poll(self.sockets, timeout=timeout)
+[1;36m(APIServer pid=3991959)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=3991959)[0;0m File "zmq/backend/cython/_zmq.py", line 1680, in zmq.backend.cython._zmq.zmq_poll
+[1;36m(APIServer pid=3991959)[0;0m File "zmq/backend/cython/_zmq.py", line 179, in zmq.backend.cython._zmq._check_rc
+[1;36m(APIServer pid=3991959)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1862, in signal_handler
+[1;36m(APIServer pid=3991959)[0;0m raise KeyboardInterrupt("terminated")
+[1;36m(APIServer pid=3991959)[0;0m KeyboardInterrupt: terminated
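
This retry four minutes later fails identically (73.18 GiB free) because it reuses the same non-default args the server logs at startup, including the hard-coded gpu_memory_utilization=0.95 and the deprecated --disable-log-requests that triggers the WARNING line. A relaunch sketch with utilization exposed as a parameter; the module path is taken from the traceback above, and omitting the deprecated flag (rather than passing --enable-log-requests) assumes request logging stays off by default in this vLLM version:

# Relaunch the server with the logged non-default args, parameterizing the
# one value that made startup fail.
import subprocess
import sys

def launch_vllm(gpu_memory_utilization: float) -> subprocess.Popen:
    cmd = [
        sys.executable, "-m", "vllm.entrypoints.openai.api_server",
        "--port", "8001",
        "--model", "/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1",
        "--dtype", "bfloat16",
        "--max-model-len", "16384",
        "--served-model-name", "inference",
        "--gpu-memory-utilization", str(gpu_memory_utilization),
        "--enable-prefix-caching",
        "--max-num-seqs", "256",
    ]
    return subprocess.Popen(cmd)

# e.g. launch_vllm(0.50) fits inside the ~73 GiB actually free at these attempts
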
diff --git a/code/RL_model/inference_data/old/vllm_server_20260224_211416.log b/code/RL_model/inference_data/old/vllm_server_20260224_211416.log
new file mode 100644
index 0000000000000000000000000000000000000000..b4a08356ca324f5104670d84acecf9a7a4eb4904
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_server_20260224_211416.log
@@ -0,0 +1,142 @@
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 21:14:24 [__init__.py:216] Automatically detected platform cuda.
+WARNING 02-24 21:14:30 [__init__.py:1742] argument '--disable-log-requests' is deprecated and replaced with '--enable-log-requests'. This will be removed in v0.12.0.
+[1;36m(APIServer pid=4002948)[0;0m INFO 02-24 21:14:30 [api_server.py:1839] vLLM API server version 0.11.0
+[1;36m(APIServer pid=4002948)[0;0m INFO 02-24 21:14:30 [utils.py:233] non-default args: {'port': 8001, 'model': '/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', 'dtype': 'bfloat16', 'max_model_len': 16384, 'served_model_name': ['inference'], 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': True, 'max_num_seqs': 256}
+[1;36m(APIServer pid=4002948)[0;0m INFO 02-24 21:14:30 [model.py:547] Resolved architecture: Qwen3ForCausalLM
+[1;36m(APIServer pid=4002948)[0;0m `torch_dtype` is deprecated! Use `dtype` instead!
+[1;36m(APIServer pid=4002948)[0;0m INFO 02-24 21:14:30 [model.py:1510] Using max model len 16384
+[1;36m(APIServer pid=4002948)[0;0m INFO 02-24 21:14:30 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 21:14:34 [__init__.py:216] Automatically detected platform cuda.
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m INFO 02-24 21:14:42 [core.py:644] Waiting for init message from front-end.
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m INFO 02-24 21:14:42 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', speculative_config=None, tokenizer='/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=inference, enable_prefix_caching=True, chunked_prefill_enabled=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention","vllm.sparse_attn_indexer"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":[2,1],"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"use_inductor_graph_partition":false,"pass_config":{},"max_capture_size":512,"local_cache_dir":null}
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m W0224 21:14:43.004000 4003723 miniconda3/envs/verl/lib/python3.12/site-packages/torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m W0224 21:14:43.004000 4003723 miniconda3/envs/verl/lib/python3.12/site-packages/torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m INFO 02-24 21:14:44 [parallel_state.py:1208] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] EngineCore failed to start.
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] Traceback (most recent call last):
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] engine_core = EngineCoreProc(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 498, in __init__
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] super().__init__(vllm_config, executor_class, log_stats,
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 83, in __init__
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] self.model_executor = executor_class(vllm_config)
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/executor_base.py", line 54, in __init__
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] self._init_executor()
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 54, in _init_executor
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] self.collective_rpc("init_device")
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 83, in collective_rpc
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] return [run_method(self.driver_worker, method, args, kwargs)]
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 3122, in run_method
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] return func(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 259, in init_device
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] self.worker.init_device() # type: ignore
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/worker/gpu_worker.py", line 187, in init_device
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] raise ValueError(
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ERROR 02-24 21:14:44 [core.py:708] ValueError: Free memory on device (32.91/139.8 GiB) on startup is less than desired GPU memory utilization (0.95, 132.81 GiB). Decrease GPU memory utilization or reduce GPU memory used by other processes.
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m Process EngineCore_DP0:
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m Traceback (most recent call last):
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m self.run()
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/process.py", line 108, in run
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m self._target(*self._args, **self._kwargs)
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 712, in run_engine_core
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m raise e
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m engine_core = EngineCoreProc(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 498, in __init__
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m super().__init__(vllm_config, executor_class, log_stats,
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 83, in __init__
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m self.model_executor = executor_class(vllm_config)
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/executor_base.py", line 54, in __init__
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m self._init_executor()
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 54, in _init_executor
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m self.collective_rpc("init_device")
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 83, in collective_rpc
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m return [run_method(self.driver_worker, method, args, kwargs)]
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 3122, in run_method
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m return func(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 259, in init_device
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m self.worker.init_device() # type: ignore
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/worker/gpu_worker.py", line 187, in init_device
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m raise ValueError(
+[1;36m(EngineCore_DP0 pid=4003723)[0;0m ValueError: Free memory on device (32.91/139.8 GiB) on startup is less than desired GPU memory utilization (0.95, 132.81 GiB). Decrease GPU memory utilization or reduce GPU memory used by other processes.
+[rank0]:[W224 21:14:45.958554367 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[1;36m(APIServer pid=4002948)[0;0m Traceback (most recent call last):
+[1;36m(APIServer pid=4002948)[0;0m File "<frozen runpy>", line 198, in _run_module_as_main
+[1;36m(APIServer pid=4002948)[0;0m File "<frozen runpy>", line 88, in _run_code
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1953, in <module>
+[1;36m(APIServer pid=4002948)[0;0m uvloop.run(run_server(args))
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 96, in run
+[1;36m(APIServer pid=4002948)[0;0m return __asyncio.run(
+[1;36m(APIServer pid=4002948)[0;0m ^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 195, in run
+[1;36m(APIServer pid=4002948)[0;0m return runner.run(main)
+[1;36m(APIServer pid=4002948)[0;0m ^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 118, in run
+[1;36m(APIServer pid=4002948)[0;0m return self._loop.run_until_complete(task)
+[1;36m(APIServer pid=4002948)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4002948)[0;0m File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 48, in wrapper
+[1;36m(APIServer pid=4002948)[0;0m return await main
+[1;36m(APIServer pid=4002948)[0;0m ^^^^^^^^^^
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1884, in run_server
+[1;36m(APIServer pid=4002948)[0;0m await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1902, in run_server_worker
+[1;36m(APIServer pid=4002948)[0;0m async with build_async_engine_client(
+[1;36m(APIServer pid=4002948)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=4002948)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=4002948)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 180, in build_async_engine_client
+[1;36m(APIServer pid=4002948)[0;0m async with build_async_engine_client_from_engine_args(
+[1;36m(APIServer pid=4002948)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=4002948)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=4002948)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 225, in build_async_engine_client_from_engine_args
+[1;36m(APIServer pid=4002948)[0;0m async_llm = AsyncLLM.from_vllm_config(
+[1;36m(APIServer pid=4002948)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 1572, in inner
+[1;36m(APIServer pid=4002948)[0;0m return fn(*args, **kwargs)
+[1;36m(APIServer pid=4002948)[0;0m ^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 207, in from_vllm_config
+[1;36m(APIServer pid=4002948)[0;0m return cls(
+[1;36m(APIServer pid=4002948)[0;0m ^^^^
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 134, in __init__
+[1;36m(APIServer pid=4002948)[0;0m self.engine_core = EngineCoreClient.make_async_mp_client(
+[1;36m(APIServer pid=4002948)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 102, in make_async_mp_client
+[1;36m(APIServer pid=4002948)[0;0m return AsyncMPClient(*client_args)
+[1;36m(APIServer pid=4002948)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 769, in __init__
+[1;36m(APIServer pid=4002948)[0;0m super().__init__(
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 448, in __init__
+[1;36m(APIServer pid=4002948)[0;0m with launch_core_engines(vllm_config, executor_class,
+[1;36m(APIServer pid=4002948)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 144, in __exit__
+[1;36m(APIServer pid=4002948)[0;0m next(self.gen)
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 732, in launch_core_engines
+[1;36m(APIServer pid=4002948)[0;0m wait_for_engine_startup(
+[1;36m(APIServer pid=4002948)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 785, in wait_for_engine_startup
+[1;36m(APIServer pid=4002948)[0;0m raise RuntimeError("Engine core initialization failed. "
+[1;36m(APIServer pid=4002948)[0;0m RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
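
Two things differ in this attempt: free memory dropped from 73.18 to 32.91 GiB, so another process claimed roughly 40 GiB between launches, and the front-end now reports the failure as a RuntimeError from wait_for_engine_startup instead of hanging on the zmq socket. A sketch, again assuming nvidia-ml-py and device 0, that lists which PIDs hold device memory so the competing process can be found and stopped before the next attempt:

# Enumerate processes holding memory on GPU 0. On some driver versions
# usedGpuMemory is None when per-process usage is not reported.
from pynvml import (
    nvmlInit,
    nvmlShutdown,
    nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetComputeRunningProcesses,
)

nvmlInit()
try:
    handle = nvmlDeviceGetHandleByIndex(0)
    for proc in nvmlDeviceGetComputeRunningProcesses(handle):
        used = proc.usedGpuMemory
        shown = f"{used / 2**30:.2f} GiB" if used is not None else "unknown"
        print(f"pid={proc.pid} holds {shown}")
finally:
    nvmlShutdown()
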
diff --git a/code/RL_model/inference_data/old/vllm_server_20260224_212205.log b/code/RL_model/inference_data/old/vllm_server_20260224_212205.log
new file mode 100644
index 0000000000000000000000000000000000000000..45ffa8eeb07e09b4294895bb0f775f7faabc5021
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_server_20260224_212205.log
@@ -0,0 +1,115 @@
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 21:22:09 [__init__.py:216] Automatically detected platform cuda.
+WARNING 02-24 21:22:16 [__init__.py:1742] argument '--disable-log-requests' is deprecated and replaced with '--enable-log-requests'. This will be removed in v0.12.0.
+[1;36m(APIServer pid=4027661)[0;0m INFO 02-24 21:22:16 [api_server.py:1839] vLLM API server version 0.11.0
+[1;36m(APIServer pid=4027661)[0;0m INFO 02-24 21:22:16 [utils.py:233] non-default args: {'port': 8001, 'model': '/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', 'dtype': 'bfloat16', 'max_model_len': 16384, 'served_model_name': ['inference'], 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': True, 'max_num_seqs': 256}
+[1;36m(APIServer pid=4027661)[0;0m INFO 02-24 21:22:16 [model.py:547] Resolved architecture: Qwen3ForCausalLM
+[1;36m(APIServer pid=4027661)[0;0m `torch_dtype` is deprecated! Use `dtype` instead!
+[1;36m(APIServer pid=4027661)[0;0m INFO 02-24 21:22:16 [model.py:1510] Using max model len 16384
+[1;36m(APIServer pid=4027661)[0;0m INFO 02-24 21:22:16 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 21:22:20 [__init__.py:216] Automatically detected platform cuda.
+[1;36m(APIServer pid=4027661)[0;0m Traceback (most recent call last):
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 495, in __init__
+[1;36m(APIServer pid=4027661)[0;0m if not sync_input_socket.poll(timeout=600_000):
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/zmq/sugar/socket.py", line 1062, in poll
+[1;36m(APIServer pid=4027661)[0;0m evts = dict(p.poll(timeout))
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/zmq/sugar/poll.py", line 106, in poll
+[1;36m(APIServer pid=4027661)[0;0m return zmq_poll(self.sockets, timeout=timeout)
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "zmq/backend/cython/_zmq.py", line 1680, in zmq.backend.cython._zmq.zmq_poll
+[1;36m(APIServer pid=4027661)[0;0m File "zmq/backend/cython/_zmq.py", line 179, in zmq.backend.cython._zmq._check_rc
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1862, in signal_handler
+[1;36m(APIServer pid=4027661)[0;0m raise KeyboardInterrupt("terminated")
+[1;36m(APIServer pid=4027661)[0;0m KeyboardInterrupt: terminated
+[1;36m(APIServer pid=4027661)[0;0m
+[1;36m(APIServer pid=4027661)[0;0m During handling of the above exception, another exception occurred:
+[1;36m(APIServer pid=4027661)[0;0m
+[1;36m(APIServer pid=4027661)[0;0m Traceback (most recent call last):
+[1;36m(APIServer pid=4027661)[0;0m File "<frozen runpy>", line 198, in _run_module_as_main
+[1;36m(APIServer pid=4027661)[0;0m File "<frozen runpy>", line 88, in _run_code
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1953, in <module>
+[1;36m(APIServer pid=4027661)[0;0m uvloop.run(run_server(args))
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 96, in run
+[1;36m(APIServer pid=4027661)[0;0m return __asyncio.run(
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 195, in run
+[1;36m(APIServer pid=4027661)[0;0m return runner.run(main)
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 118, in run
+[1;36m(APIServer pid=4027661)[0;0m return self._loop.run_until_complete(task)
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "uvloop/loop.pyx", line 1512, in uvloop.loop.Loop.run_until_complete
+[1;36m(APIServer pid=4027661)[0;0m File "uvloop/loop.pyx", line 1505, in uvloop.loop.Loop.run_until_complete
+[1;36m(APIServer pid=4027661)[0;0m File "uvloop/loop.pyx", line 1379, in uvloop.loop.Loop.run_forever
+[1;36m(APIServer pid=4027661)[0;0m File "uvloop/loop.pyx", line 557, in uvloop.loop.Loop._run
+[1;36m(APIServer pid=4027661)[0;0m File "uvloop/loop.pyx", line 476, in uvloop.loop.Loop._on_idle
+[1;36m(APIServer pid=4027661)[0;0m File "uvloop/cbhandles.pyx", line 83, in uvloop.loop.Handle._run
+[1;36m(APIServer pid=4027661)[0;0m File "uvloop/cbhandles.pyx", line 61, in uvloop.loop.Handle._run
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 48, in wrapper
+[1;36m(APIServer pid=4027661)[0;0m return await main
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1884, in run_server
+[1;36m(APIServer pid=4027661)[0;0m await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1902, in run_server_worker
+[1;36m(APIServer pid=4027661)[0;0m async with build_async_engine_client(
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=4027661)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 180, in build_async_engine_client
+[1;36m(APIServer pid=4027661)[0;0m async with build_async_engine_client_from_engine_args(
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=4027661)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 225, in build_async_engine_client_from_engine_args
+[1;36m(APIServer pid=4027661)[0;0m async_llm = AsyncLLM.from_vllm_config(
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 1572, in inner
+[1;36m(APIServer pid=4027661)[0;0m return fn(*args, **kwargs)
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 207, in from_vllm_config
+[1;36m(APIServer pid=4027661)[0;0m return cls(
+[1;36m(APIServer pid=4027661)[0;0m ^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 134, in __init__
+[1;36m(APIServer pid=4027661)[0;0m self.engine_core = EngineCoreClient.make_async_mp_client(
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 102, in make_async_mp_client
+[1;36m(APIServer pid=4027661)[0;0m return AsyncMPClient(*client_args)
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 769, in __init__
+[1;36m(APIServer pid=4027661)[0;0m super().__init__(
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 515, in __init__
+[1;36m(APIServer pid=4027661)[0;0m self._finalizer()
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/weakref.py", line 590, in __call__
+[1;36m(APIServer pid=4027661)[0;0m return info.func(*info.args, **(info.kwargs or {}))
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 344, in __call__
+[1;36m(APIServer pid=4027661)[0;0m self.engine_manager.close()
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 141, in close
+[1;36m(APIServer pid=4027661)[0;0m self._finalizer()
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/weakref.py", line 590, in __call__
+[1;36m(APIServer pid=4027661)[0;0m return info.func(*info.args, **(info.kwargs or {}))
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/utils.py", line 315, in shutdown
+[1;36m(APIServer pid=4027661)[0;0m proc.join(remaining)
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/process.py", line 149, in join
+[1;36m(APIServer pid=4027661)[0;0m res = self._popen.wait(timeout)
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/popen_fork.py", line 40, in wait
+[1;36m(APIServer pid=4027661)[0;0m if not wait([self.sentinel], timeout):
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/connection.py", line 1136, in wait
+[1;36m(APIServer pid=4027661)[0;0m ready = selector.select(timeout)
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/selectors.py", line 415, in select
+[1;36m(APIServer pid=4027661)[0;0m fd_event_list = self._selector.poll(timeout)
+[1;36m(APIServer pid=4027661)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4027661)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1862, in signal_handler
+[1;36m(APIServer pid=4027661)[0;0m raise KeyboardInterrupt("terminated")
+[1;36m(APIServer pid=4027661)[0;0m KeyboardInterrupt: terminated
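
In this log the engine core never appears at all; the API server sat in sync_input_socket.poll(timeout=600_000), a ten-minute zmq wait, until an external SIGTERM raised KeyboardInterrupt("terminated"). A launcher-side readiness probe with an explicit deadline surfaces such hangs much earlier; this sketch assumes the OpenAI-compatible server exposes GET /health on the logged port 8001:

# Poll the server's health endpoint with a bounded deadline instead of
# waiting for the 600 s zmq poll to be killed from outside.
import time
import urllib.error
import urllib.request

def wait_until_ready(url: str = "http://localhost:8001/health",
                     deadline_s: float = 120.0) -> bool:
    start = time.monotonic()
    while time.monotonic() - start < deadline_s:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return True
        except (urllib.error.URLError, OSError):
            pass  # not accepting connections yet; keep polling
        time.sleep(2)
    return False
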
diff --git a/code/RL_model/inference_data/old/vllm_server_20260224_212254.log b/code/RL_model/inference_data/old/vllm_server_20260224_212254.log
new file mode 100644
index 0000000000000000000000000000000000000000..80d5c705abcb134217a317d546f4a5d8a1e604c4
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_server_20260224_212254.log
@@ -0,0 +1,116 @@
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 21:22:59 [__init__.py:216] Automatically detected platform cuda.
+WARNING 02-24 21:23:05 [__init__.py:1742] argument '--disable-log-requests' is deprecated and replaced with '--enable-log-requests'. This will be removed in v0.12.0.
+[1;36m(APIServer pid=4030408)[0;0m INFO 02-24 21:23:05 [api_server.py:1839] vLLM API server version 0.11.0
+[1;36m(APIServer pid=4030408)[0;0m INFO 02-24 21:23:05 [utils.py:233] non-default args: {'port': 8001, 'model': '/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', 'dtype': 'bfloat16', 'max_model_len': 16384, 'served_model_name': ['inference'], 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': True, 'max_num_seqs': 256}
+[1;36m(APIServer pid=4030408)[0;0m INFO 02-24 21:23:05 [model.py:547] Resolved architecture: Qwen3ForCausalLM
+[1;36m(APIServer pid=4030408)[0;0m `torch_dtype` is deprecated! Use `dtype` instead!
+[1;36m(APIServer pid=4030408)[0;0m INFO 02-24 21:23:05 [model.py:1510] Using max model len 16384
+[1;36m(APIServer pid=4030408)[0;0m INFO 02-24 21:23:05 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 21:23:10 [__init__.py:216] Automatically detected platform cuda.
+[1;36m(EngineCore_DP0 pid=4031207)[0;0m INFO 02-24 21:23:16 [core.py:644] Waiting for init message from front-end.
+[1;36m(APIServer pid=4030408)[0;0m Traceback (most recent call last):
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 495, in __init__
+[1;36m(APIServer pid=4030408)[0;0m if not sync_input_socket.poll(timeout=600_000):
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/zmq/sugar/socket.py", line 1062, in poll
+[1;36m(APIServer pid=4030408)[0;0m evts = dict(p.poll(timeout))
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/zmq/sugar/poll.py", line 106, in poll
+[1;36m(APIServer pid=4030408)[0;0m return zmq_poll(self.sockets, timeout=timeout)
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "zmq/backend/cython/_zmq.py", line 1680, in zmq.backend.cython._zmq.zmq_poll
+[1;36m(APIServer pid=4030408)[0;0m File "zmq/backend/cython/_zmq.py", line 179, in zmq.backend.cython._zmq._check_rc
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1862, in signal_handler
+[1;36m(APIServer pid=4030408)[0;0m raise KeyboardInterrupt("terminated")
+[1;36m(APIServer pid=4030408)[0;0m KeyboardInterrupt: terminated
+[1;36m(APIServer pid=4030408)[0;0m
+[1;36m(APIServer pid=4030408)[0;0m During handling of the above exception, another exception occurred:
+[1;36m(APIServer pid=4030408)[0;0m
+[1;36m(APIServer pid=4030408)[0;0m Traceback (most recent call last):
+[1;36m(APIServer pid=4030408)[0;0m File "<frozen runpy>", line 198, in _run_module_as_main
+[1;36m(APIServer pid=4030408)[0;0m File "<frozen runpy>", line 88, in _run_code
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1953, in <module>
+[1;36m(APIServer pid=4030408)[0;0m uvloop.run(run_server(args))
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 96, in run
+[1;36m(APIServer pid=4030408)[0;0m return __asyncio.run(
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 195, in run
+[1;36m(APIServer pid=4030408)[0;0m return runner.run(main)
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 118, in run
+[1;36m(APIServer pid=4030408)[0;0m return self._loop.run_until_complete(task)
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "uvloop/loop.pyx", line 1512, in uvloop.loop.Loop.run_until_complete
+[1;36m(APIServer pid=4030408)[0;0m File "uvloop/loop.pyx", line 1505, in uvloop.loop.Loop.run_until_complete
+[1;36m(APIServer pid=4030408)[0;0m File "uvloop/loop.pyx", line 1379, in uvloop.loop.Loop.run_forever
+[1;36m(APIServer pid=4030408)[0;0m File "uvloop/loop.pyx", line 557, in uvloop.loop.Loop._run
+[1;36m(APIServer pid=4030408)[0;0m File "uvloop/loop.pyx", line 476, in uvloop.loop.Loop._on_idle
+[1;36m(APIServer pid=4030408)[0;0m File "uvloop/cbhandles.pyx", line 83, in uvloop.loop.Handle._run
+[1;36m(APIServer pid=4030408)[0;0m File "uvloop/cbhandles.pyx", line 61, in uvloop.loop.Handle._run
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 48, in wrapper
+[1;36m(APIServer pid=4030408)[0;0m return await main
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1884, in run_server
+[1;36m(APIServer pid=4030408)[0;0m await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1902, in run_server_worker
+[1;36m(APIServer pid=4030408)[0;0m async with build_async_engine_client(
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=4030408)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 180, in build_async_engine_client
+[1;36m(APIServer pid=4030408)[0;0m async with build_async_engine_client_from_engine_args(
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=4030408)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 225, in build_async_engine_client_from_engine_args
+[1;36m(APIServer pid=4030408)[0;0m async_llm = AsyncLLM.from_vllm_config(
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 1572, in inner
+[1;36m(APIServer pid=4030408)[0;0m return fn(*args, **kwargs)
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 207, in from_vllm_config
+[1;36m(APIServer pid=4030408)[0;0m return cls(
+[1;36m(APIServer pid=4030408)[0;0m ^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 134, in __init__
+[1;36m(APIServer pid=4030408)[0;0m self.engine_core = EngineCoreClient.make_async_mp_client(
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 102, in make_async_mp_client
+[1;36m(APIServer pid=4030408)[0;0m return AsyncMPClient(*client_args)
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 769, in __init__
+[1;36m(APIServer pid=4030408)[0;0m super().__init__(
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 515, in __init__
+[1;36m(APIServer pid=4030408)[0;0m self._finalizer()
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/weakref.py", line 590, in __call__
+[1;36m(APIServer pid=4030408)[0;0m return info.func(*info.args, **(info.kwargs or {}))
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 344, in __call__
+[1;36m(APIServer pid=4030408)[0;0m self.engine_manager.close()
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 141, in close
+[1;36m(APIServer pid=4030408)[0;0m self._finalizer()
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/weakref.py", line 590, in __call__
+[1;36m(APIServer pid=4030408)[0;0m return info.func(*info.args, **(info.kwargs or {}))
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/utils.py", line 315, in shutdown
+[1;36m(APIServer pid=4030408)[0;0m proc.join(remaining)
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/process.py", line 149, in join
+[1;36m(APIServer pid=4030408)[0;0m res = self._popen.wait(timeout)
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/popen_fork.py", line 40, in wait
+[1;36m(APIServer pid=4030408)[0;0m if not wait([self.sentinel], timeout):
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/connection.py", line 1136, in wait
+[1;36m(APIServer pid=4030408)[0;0m ready = selector.select(timeout)
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/selectors.py", line 415, in select
+[1;36m(APIServer pid=4030408)[0;0m fd_event_list = self._selector.poll(timeout)
+[1;36m(APIServer pid=4030408)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4030408)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1862, in signal_handler
+[1;36m(APIServer pid=4030408)[0;0m raise KeyboardInterrupt("terminated")
+[1;36m(APIServer pid=4030408)[0;0m KeyboardInterrupt: terminated
diff --git a/code/RL_model/inference_data/old/vllm_server_20260224_212811.log b/code/RL_model/inference_data/old/vllm_server_20260224_212811.log
new file mode 100644
index 0000000000000000000000000000000000000000..ec794db23b89c804758a75dec903a4c4560907e2
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_server_20260224_212811.log
@@ -0,0 +1,116 @@
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 21:28:15 [__init__.py:216] Automatically detected platform cuda.
+WARNING 02-24 21:28:20 [__init__.py:1742] argument '--disable-log-requests' is deprecated and replaced with '--enable-log-requests'. This will be removed in v0.12.0.
+[1;36m(APIServer pid=4047170)[0;0m INFO 02-24 21:28:20 [api_server.py:1839] vLLM API server version 0.11.0
+[1;36m(APIServer pid=4047170)[0;0m INFO 02-24 21:28:20 [utils.py:233] non-default args: {'port': 8001, 'model': '/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', 'dtype': 'bfloat16', 'max_model_len': 16384, 'served_model_name': ['inference'], 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': True, 'max_num_seqs': 256}
+[1;36m(APIServer pid=4047170)[0;0m INFO 02-24 21:28:20 [model.py:547] Resolved architecture: Qwen3ForCausalLM
+[1;36m(APIServer pid=4047170)[0;0m `torch_dtype` is deprecated! Use `dtype` instead!
+[1;36m(APIServer pid=4047170)[0;0m INFO 02-24 21:28:20 [model.py:1510] Using max model len 16384
+[1;36m(APIServer pid=4047170)[0;0m INFO 02-24 21:28:21 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 21:28:25 [__init__.py:216] Automatically detected platform cuda.
+[1;36m(EngineCore_DP0 pid=4048005)[0;0m INFO 02-24 21:28:31 [core.py:644] Waiting for init message from front-end.
+[1;36m(APIServer pid=4047170)[0;0m Traceback (most recent call last):
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 495, in __init__
+[1;36m(APIServer pid=4047170)[0;0m if not sync_input_socket.poll(timeout=600_000):
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/zmq/sugar/socket.py", line 1062, in poll
+[1;36m(APIServer pid=4047170)[0;0m evts = dict(p.poll(timeout))
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/zmq/sugar/poll.py", line 106, in poll
+[1;36m(APIServer pid=4047170)[0;0m return zmq_poll(self.sockets, timeout=timeout)
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "zmq/backend/cython/_zmq.py", line 1680, in zmq.backend.cython._zmq.zmq_poll
+[1;36m(APIServer pid=4047170)[0;0m File "zmq/backend/cython/_zmq.py", line 179, in zmq.backend.cython._zmq._check_rc
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1862, in signal_handler
+[1;36m(APIServer pid=4047170)[0;0m raise KeyboardInterrupt("terminated")
+[1;36m(APIServer pid=4047170)[0;0m KeyboardInterrupt: terminated
+[1;36m(APIServer pid=4047170)[0;0m
+[1;36m(APIServer pid=4047170)[0;0m During handling of the above exception, another exception occurred:
+[1;36m(APIServer pid=4047170)[0;0m
+[1;36m(APIServer pid=4047170)[0;0m Traceback (most recent call last):
+[1;36m(APIServer pid=4047170)[0;0m File "<frozen runpy>", line 198, in _run_module_as_main
+[1;36m(APIServer pid=4047170)[0;0m File "<frozen runpy>", line 88, in _run_code
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1953, in <module>
+[1;36m(APIServer pid=4047170)[0;0m uvloop.run(run_server(args))
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 96, in run
+[1;36m(APIServer pid=4047170)[0;0m return __asyncio.run(
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 195, in run
+[1;36m(APIServer pid=4047170)[0;0m return runner.run(main)
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 118, in run
+[1;36m(APIServer pid=4047170)[0;0m return self._loop.run_until_complete(task)
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "uvloop/loop.pyx", line 1512, in uvloop.loop.Loop.run_until_complete
+[1;36m(APIServer pid=4047170)[0;0m File "uvloop/loop.pyx", line 1505, in uvloop.loop.Loop.run_until_complete
+[1;36m(APIServer pid=4047170)[0;0m File "uvloop/loop.pyx", line 1379, in uvloop.loop.Loop.run_forever
+[1;36m(APIServer pid=4047170)[0;0m File "uvloop/loop.pyx", line 557, in uvloop.loop.Loop._run
+[1;36m(APIServer pid=4047170)[0;0m File "uvloop/loop.pyx", line 476, in uvloop.loop.Loop._on_idle
+[1;36m(APIServer pid=4047170)[0;0m File "uvloop/cbhandles.pyx", line 83, in uvloop.loop.Handle._run
+[1;36m(APIServer pid=4047170)[0;0m File "uvloop/cbhandles.pyx", line 61, in uvloop.loop.Handle._run
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 48, in wrapper
+[1;36m(APIServer pid=4047170)[0;0m return await main
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1884, in run_server
+[1;36m(APIServer pid=4047170)[0;0m await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1902, in run_server_worker
+[1;36m(APIServer pid=4047170)[0;0m async with build_async_engine_client(
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=4047170)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 180, in build_async_engine_client
+[1;36m(APIServer pid=4047170)[0;0m async with build_async_engine_client_from_engine_args(
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=4047170)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 225, in build_async_engine_client_from_engine_args
+[1;36m(APIServer pid=4047170)[0;0m async_llm = AsyncLLM.from_vllm_config(
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 1572, in inner
+[1;36m(APIServer pid=4047170)[0;0m return fn(*args, **kwargs)
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 207, in from_vllm_config
+[1;36m(APIServer pid=4047170)[0;0m return cls(
+[1;36m(APIServer pid=4047170)[0;0m ^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 134, in __init__
+[1;36m(APIServer pid=4047170)[0;0m self.engine_core = EngineCoreClient.make_async_mp_client(
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 102, in make_async_mp_client
+[1;36m(APIServer pid=4047170)[0;0m return AsyncMPClient(*client_args)
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 769, in __init__
+[1;36m(APIServer pid=4047170)[0;0m super().__init__(
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 515, in __init__
+[1;36m(APIServer pid=4047170)[0;0m self._finalizer()
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/weakref.py", line 590, in __call__
+[1;36m(APIServer pid=4047170)[0;0m return info.func(*info.args, **(info.kwargs or {}))
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 344, in __call__
+[1;36m(APIServer pid=4047170)[0;0m self.engine_manager.close()
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 141, in close
+[1;36m(APIServer pid=4047170)[0;0m self._finalizer()
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/weakref.py", line 590, in __call__
+[1;36m(APIServer pid=4047170)[0;0m return info.func(*info.args, **(info.kwargs or {}))
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/utils.py", line 315, in shutdown
+[1;36m(APIServer pid=4047170)[0;0m proc.join(remaining)
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/process.py", line 149, in join
+[1;36m(APIServer pid=4047170)[0;0m res = self._popen.wait(timeout)
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/popen_fork.py", line 40, in wait
+[1;36m(APIServer pid=4047170)[0;0m if not wait([self.sentinel], timeout):
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/connection.py", line 1136, in wait
+[1;36m(APIServer pid=4047170)[0;0m ready = selector.select(timeout)
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/selectors.py", line 415, in select
+[1;36m(APIServer pid=4047170)[0;0m fd_event_list = self._selector.poll(timeout)
+[1;36m(APIServer pid=4047170)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4047170)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1862, in signal_handler
+[1;36m(APIServer pid=4047170)[0;0m raise KeyboardInterrupt("terminated")
+[1;36m(APIServer pid=4047170)[0;0m KeyboardInterrupt: terminated
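Editor's note: the two startup logs above record the same failure pattern. The API front-end blocks on `sync_input_socket.poll(timeout=600_000)` waiting for the engine core to report ready, and a termination signal arrives first (`signal_handler` converts it into `KeyboardInterrupt: terminated`). The next log captures why the engine core never came up. For reference, here is a hypothetical relaunch helper reconstructed from the "non-default args" line in these logs; the actual launch script is not part of this diff, and this sketch also drops the deprecated `--disable-log-requests` flag that the WARNING lines complain about:

```python
# Hypothetical launcher, reconstructed from the logged non-default args.
# The real launch script is not in this diff.
import datetime
import subprocess

ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
log_path = f"vllm_server_{ts}.log"  # matches the vllm_server_*.log names above

cmd = [
    "python", "-m", "vllm.entrypoints.openai.api_server",
    "--port", "8001",
    "--model", "/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1",
    "--served-model-name", "inference",
    "--dtype", "bfloat16",
    "--max-model-len", "16384",
    "--max-num-seqs", "256",
    "--gpu-memory-utilization", "0.95",
    "--enable-prefix-caching",
    # --disable-log-requests is deprecated (see the WARNING above);
    # request logging is left at its default here.
]
with open(log_path, "w") as log:
    subprocess.Popen(cmd, stdout=log, stderr=subprocess.STDOUT)
```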
diff --git a/code/RL_model/inference_data/old/vllm_server_20260224_213049.log b/code/RL_model/inference_data/old/vllm_server_20260224_213049.log
new file mode 100644
index 0000000000000000000000000000000000000000..887253cfe3bbebfdca562085fe9251269c920d78
--- /dev/null
+++ b/code/RL_model/inference_data/old/vllm_server_20260224_213049.log
@@ -0,0 +1,142 @@
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 21:30:53 [__init__.py:216] Automatically detected platform cuda.
+WARNING 02-24 21:31:00 [__init__.py:1742] argument '--disable-log-requests' is deprecated and replaced with '--enable-log-requests'. This will be removed in v0.12.0.
+[1;36m(APIServer pid=4055495)[0;0m INFO 02-24 21:31:00 [api_server.py:1839] vLLM API server version 0.11.0
+[1;36m(APIServer pid=4055495)[0;0m INFO 02-24 21:31:00 [utils.py:233] non-default args: {'port': 8001, 'model': '/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', 'dtype': 'bfloat16', 'max_model_len': 16384, 'served_model_name': ['inference'], 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': True, 'max_num_seqs': 256}
+[1;36m(APIServer pid=4055495)[0;0m INFO 02-24 21:31:00 [model.py:547] Resolved architecture: Qwen3ForCausalLM
+[1;36m(APIServer pid=4055495)[0;0m `torch_dtype` is deprecated! Use `dtype` instead!
+[1;36m(APIServer pid=4055495)[0;0m INFO 02-24 21:31:00 [model.py:1510] Using max model len 16384
+[1;36m(APIServer pid=4055495)[0;0m INFO 02-24 21:31:00 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
+/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
+ import pynvml # type: ignore[import]
+INFO 02-24 21:31:05 [__init__.py:216] Automatically detected platform cuda.
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m INFO 02-24 21:31:11 [core.py:644] Waiting for init message from front-end.
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m INFO 02-24 21:31:11 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', speculative_config=None, tokenizer='/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=inference, enable_prefix_caching=True, chunked_prefill_enabled=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention","vllm.sparse_attn_indexer"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":[2,1],"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"use_inductor_graph_partition":false,"pass_config":{},"max_capture_size":512,"local_cache_dir":null}
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m W0224 21:31:12.558000 4056247 miniconda3/envs/verl/lib/python3.12/site-packages/torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m W0224 21:31:12.558000 4056247 miniconda3/envs/verl/lib/python3.12/site-packages/torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m INFO 02-24 21:31:13 [parallel_state.py:1208] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] EngineCore failed to start.
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] Traceback (most recent call last):
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] engine_core = EngineCoreProc(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 498, in __init__
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] super().__init__(vllm_config, executor_class, log_stats,
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 83, in __init__
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] self.model_executor = executor_class(vllm_config)
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/executor_base.py", line 54, in __init__
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] self._init_executor()
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 54, in _init_executor
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] self.collective_rpc("init_device")
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 83, in collective_rpc
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] return [run_method(self.driver_worker, method, args, kwargs)]
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 3122, in run_method
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] return func(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 259, in init_device
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] self.worker.init_device() # type: ignore
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/worker/gpu_worker.py", line 187, in init_device
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] raise ValueError(
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ERROR 02-24 21:31:13 [core.py:708] ValueError: Free memory on device (110.1/139.8 GiB) on startup is less than desired GPU memory utilization (0.95, 132.81 GiB). Decrease GPU memory utilization or reduce GPU memory used by other processes.
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m Process EngineCore_DP0:
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m Traceback (most recent call last):
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m self.run()
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/multiprocessing/process.py", line 108, in run
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m self._target(*self._args, **self._kwargs)
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 712, in run_engine_core
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m raise e
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m engine_core = EngineCoreProc(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 498, in __init__
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m super().__init__(vllm_config, executor_class, log_stats,
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 83, in __init__
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m self.model_executor = executor_class(vllm_config)
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/executor_base.py", line 54, in __init__
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m self._init_executor()
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 54, in _init_executor
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m self.collective_rpc("init_device")
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 83, in collective_rpc
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m return [run_method(self.driver_worker, method, args, kwargs)]
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 3122, in run_method
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m return func(*args, **kwargs)
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 259, in init_device
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m self.worker.init_device() # type: ignore
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/worker/gpu_worker.py", line 187, in init_device
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m raise ValueError(
+[1;36m(EngineCore_DP0 pid=4056247)[0;0m ValueError: Free memory on device (110.1/139.8 GiB) on startup is less than desired GPU memory utilization (0.95, 132.81 GiB). Decrease GPU memory utilization or reduce GPU memory used by other processes.
+[rank0]:[W224 21:31:14.124659903 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
+[1;36m(APIServer pid=4055495)[0;0m Traceback (most recent call last):
+[1;36m(APIServer pid=4055495)[0;0m File "<frozen runpy>", line 198, in _run_module_as_main
+[1;36m(APIServer pid=4055495)[0;0m File "<frozen runpy>", line 88, in _run_code
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1953, in <module>
+[1;36m(APIServer pid=4055495)[0;0m uvloop.run(run_server(args))
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 96, in run
+[1;36m(APIServer pid=4055495)[0;0m return __asyncio.run(
+[1;36m(APIServer pid=4055495)[0;0m ^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 195, in run
+[1;36m(APIServer pid=4055495)[0;0m return runner.run(main)
+[1;36m(APIServer pid=4055495)[0;0m ^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/asyncio/runners.py", line 118, in run
+[1;36m(APIServer pid=4055495)[0;0m return self._loop.run_until_complete(task)
+[1;36m(APIServer pid=4055495)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4055495)[0;0m File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/uvloop/__init__.py", line 48, in wrapper
+[1;36m(APIServer pid=4055495)[0;0m return await main
+[1;36m(APIServer pid=4055495)[0;0m ^^^^^^^^^^
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1884, in run_server
+[1;36m(APIServer pid=4055495)[0;0m await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1902, in run_server_worker
+[1;36m(APIServer pid=4055495)[0;0m async with build_async_engine_client(
+[1;36m(APIServer pid=4055495)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=4055495)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=4055495)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 180, in build_async_engine_client
+[1;36m(APIServer pid=4055495)[0;0m async with build_async_engine_client_from_engine_args(
+[1;36m(APIServer pid=4055495)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 210, in __aenter__
+[1;36m(APIServer pid=4055495)[0;0m return await anext(self.gen)
+[1;36m(APIServer pid=4055495)[0;0m ^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 225, in build_async_engine_client_from_engine_args
+[1;36m(APIServer pid=4055495)[0;0m async_llm = AsyncLLM.from_vllm_config(
+[1;36m(APIServer pid=4055495)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/utils/__init__.py", line 1572, in inner
+[1;36m(APIServer pid=4055495)[0;0m return fn(*args, **kwargs)
+[1;36m(APIServer pid=4055495)[0;0m ^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 207, in from_vllm_config
+[1;36m(APIServer pid=4055495)[0;0m return cls(
+[1;36m(APIServer pid=4055495)[0;0m ^^^^
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 134, in __init__
+[1;36m(APIServer pid=4055495)[0;0m self.engine_core = EngineCoreClient.make_async_mp_client(
+[1;36m(APIServer pid=4055495)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 102, in make_async_mp_client
+[1;36m(APIServer pid=4055495)[0;0m return AsyncMPClient(*client_args)
+[1;36m(APIServer pid=4055495)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 769, in __init__
+[1;36m(APIServer pid=4055495)[0;0m super().__init__(
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 448, in __init__
+[1;36m(APIServer pid=4055495)[0;0m with launch_core_engines(vllm_config, executor_class,
+[1;36m(APIServer pid=4055495)[0;0m ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/contextlib.py", line 144, in __exit__
+[1;36m(APIServer pid=4055495)[0;0m next(self.gen)
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 732, in launch_core_engines
+[1;36m(APIServer pid=4055495)[0;0m wait_for_engine_startup(
+[1;36m(APIServer pid=4055495)[0;0m File "/home/mshahidul/miniconda3/envs/verl/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 785, in wait_for_engine_startup
+[1;36m(APIServer pid=4055495)[0;0m raise RuntimeError("Engine core initialization failed. "
+[1;36m(APIServer pid=4055495)[0;0m RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
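Editor's note: this log finally surfaces the root cause of the repeated startup failures. Only 110.1 of 139.8 GiB were free on the device, while `gpu_memory_utilization=0.95` asks vLLM to reserve 0.95 × 139.8 ≈ 132.81 GiB of the total. A minimal sketch (assuming `torch` with CUDA is importable in the same `verl` env) for choosing a utilization value that fits the memory actually free:

```python
# Minimal sketch: derive --gpu-memory-utilization from the memory that is
# actually free, instead of the fixed 0.95 that failed in the log above.
import torch

free_b, total_b = torch.cuda.mem_get_info()  # (free, total) in bytes
free_frac = free_b / total_b
safe_util = round(max(0.05, free_frac - 0.02), 2)  # keep a small margin
print(f"free {free_b / 2**30:.1f} GiB of {total_b / 2**30:.1f} GiB "
      f"-> try --gpu-memory-utilization {safe_util}")
```

For the numbers in this log that works out to roughly 0.76; the alternative, as the error message itself says, is to free the GPU memory held by the other process.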
diff --git a/code/RL_model/unsloth_rl/RL_code.py b/code/RL_model/unsloth_rl/RL_code.py
new file mode 100644
index 0000000000000000000000000000000000000000..52e0ece1adb2f64457a5431b470d16a28d96011f
--- /dev/null
+++ b/code/RL_model/unsloth_rl/RL_code.py
@@ -0,0 +1,165 @@
+import os
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "3"
+from unsloth import FastLanguageModel
+import torch
+from health_classifier import classifier
+max_seq_length = 8192
+
+model, tokenizer = FastLanguageModel.from_pretrained(
+ model_name = "/home/mshahidul/readctrl_model/RL_model/readability_sft_lora_model",
+ max_seq_length = max_seq_length,
+ load_in_4bit = False, # Set to False if you have enough VRAM
+ fast_inference = False,
+)
+
+# Simply enable gradient checkpointing and prepare for training
+model = FastLanguageModel.for_training(model)
+
+import json
+
+from datasets import Dataset
+
+# Extracted subclaims for the English MultiClinSum test split.
+with open("/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_multiclinsum_test_en_full.json", "r") as f:
+    data = json.load(f)
+dataset = Dataset.from_list(data)
+
+# System prompt template shared by every training example.
+with open('/home/mshahidul/readctrl/code/RL_model/prompt', 'r') as f:
+    prompt_template = f.read()
+dataset = dataset.map(lambda x: {
+ "prompt" : [
+ {"role": "system", "content": prompt_template},
+ {"role": "user", "content": f'''
+- Input Language: English
+- Gold Summary (the anchor reference summary): {x['summary']}
+- Source Text (detailed content): {x['fulltext']}
+'''},
+ ],
+ "answer": {
+ "fulltext_subclaims": x['fulltext_subclaims'],
+ "summary_subclaims": x['summary_subclaims'],
+ },
+})
+import re  # only used by the (currently disabled) format reward below
+
+from claim_verifier import MedicalClaimVerifier
+
+verifier = MedicalClaimVerifier()
+
+def claim_reward_func(prompts, completions, answer, **kwargs):
+    """
+    GRPO reward function.
+    Expects 'summary_subclaims' and 'fulltext_subclaims' to be in the dataset.
+    """
+    rewards = []
+    # Score each completion in the GRPO group against its gold subclaims.
+    for i in range(len(completions)):
+        reward = verifier.get_reward_score(
+            completions[i],
+            answer[i]["summary_subclaims"],
+            answer[i]["fulltext_subclaims"]
+        )
+        rewards.append(reward)
+    return rewards
+
+
+# def format_reward_func(completions, **kwargs):
+# required_keys = ["low_health_literacy", "intermediate_health_literacy", "proficient_health_literacy"]
+# scores = []
+# for completion in completions:
+# try:
+# match = re.search(r"(.*?)", completion, re.DOTALL)
+# content = match.group(1) if match else completion
+# data = json.loads(content)
+# if all(k in data for k in required_keys):
+# scores.append(2.0)
+# else:
+# scores.append(-1.0)
+# except:
+# scores.append(-2.0)
+# return scores
+
+
+def literacy_classifier_reward_func(completions, **kwargs):
+    """
+    Reward completions whose per-level summaries are classified as the
+    literacy level they claim to be (low / intermediate / proficient).
+    """
+    scores = []
+    for completion in completions:
+        try:
+            # 1. Strip potential Markdown code fences (```json ... ```)
+            cleaned_content = completion[0]['content'].strip()
+            if cleaned_content.startswith("```"):
+                cleaned_content = cleaned_content.split("```")[1]
+                if cleaned_content.startswith("json"):
+                    cleaned_content = cleaned_content[4:]
+
+            # 2. Parse the JSON
+            data = json.loads(cleaned_content.strip())
+
+            alignment_score = 0.0
+            target_labels = ["low", "intermediate", "proficient"]
+
+            for label in target_labels:
+                key = f"{label}_health_literacy"
+                text_to_test = data.get(key, "")
+
+                if text_to_test:
+                    # Run the DSPy classifier
+                    result = classifier(summary_text=text_to_test)
+                    predicted = result.label  # e.g. "low_health_literacy"
+
+                    if predicted == key:
+                        alignment_score += 1.0
+                    else:
+                        # Soft penalty for misclassification
+                        alignment_score -= 0.5
+                else:
+                    # Penalty if a literacy level is missing from the JSON
+                    alignment_score -= 0.3
+
+            scores.append(alignment_score)
+
+        except Exception:
+            # Significant penalty for malformed JSON or failed processing
+            scores.append(-1.0)
+
+    return scores
+
+
+from trl import GRPOConfig, GRPOTrainer
+
+training_args = GRPOConfig(
+ learning_rate = 5e-6,
+ lr_scheduler_type = "cosine",
+ weight_decay = 0.1,
+ max_prompt_length = 8192,
+ max_completion_length = 4096,
+    # num_train_epochs = 10,  # epoch-based schedule unused; max_steps below wins
+    num_generations = 4, # GRPO group size (completions sampled per prompt)
+ per_device_train_batch_size = 4,
+ gradient_accumulation_steps = 4,
+ max_steps = 500,
+ bf16 = True,
+ output_dir = "medical_grpo_outputs",
+)
+
+trainer = GRPOTrainer(
+ model = model,
+ reward_funcs = [
+ claim_reward_func,
+ # format_reward_func,
+ literacy_classifier_reward_func
+ ],
+ args = training_args,
+ train_dataset = dataset, # Use the same dataset from your SFT prep
+ tokenizer = tokenizer,
+)
+
+trainer.train()
+
+model.save_pretrained("/home/mshahidul/readctrl_model/readability_GRPO_model_v1")
+tokenizer.save_pretrained("/home/mshahidul/readctrl_model/readability_GRPO_model_v1")
\ No newline at end of file
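Editor's note: `RL_code.py` above plugs two custom reward functions into TRL's `GRPOTrainer`. TRL's contract is that each reward function receives parallel lists (`prompts`, `completions`, plus any extra dataset columns such as `answer` forwarded as keyword arguments) and returns one float per completion. A self-contained sketch of that contract with a hypothetical JSON-validity reward, so it runs without the repo's `health_classifier` / `claim_verifier` modules:

```python
# Hypothetical stand-in reward illustrating the GRPOTrainer contract:
# parallel lists in, one float per completion out.
import json

def json_parse_reward(completions, **kwargs):
    """+1.0 if the completion's content parses as JSON, else -1.0."""
    scores = []
    for completion in completions:
        try:
            json.loads(completion[0]["content"])
            scores.append(1.0)
        except (KeyError, IndexError, TypeError, json.JSONDecodeError):
            scores.append(-1.0)
    return scores

# Chat-format completions: each one is a list of message dicts, the shape
# produced when the dataset's "prompt" column holds chat messages as above.
fake_completions = [
    [{"role": "assistant", "content": '{"low_health_literacy": "ok"}'}],
    [{"role": "assistant", "content": "not json"}],
]
print(json_parse_reward(fake_completions))  # [1.0, -1.0]
```

Smoke-testing reward functions offline like this catches parsing and shape bugs before they burn a multi-hour GRPO run.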
diff --git a/code/RL_model/unsloth_rl/RL_training.ipynb b/code/RL_model/unsloth_rl/RL_training.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..fb0664a14c587c318eb52a73e28e0b6fc63152b6
--- /dev/null
+++ b/code/RL_model/unsloth_rl/RL_training.ipynb
@@ -0,0 +1,475 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8a790cb6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from unsloth import FastLanguageModel\n",
+ "import torch\n",
+ "max_seq_length = 2048 # Can increase for longer reasoning traces\n",
+ "lora_rank = 32 # Larger rank = smarter, but slower\n",
+ "\n",
+ "model, tokenizer = FastLanguageModel.from_pretrained(\n",
+ " model_name = \"unsloth/Qwen3-4B-Base\",\n",
+ " max_seq_length = max_seq_length,\n",
+ " load_in_4bit = False, # False for LoRA 16bit\n",
+ " fast_inference = True, # Enable vLLM fast inference\n",
+ " max_lora_rank = lora_rank,\n",
+ " gpu_memory_utilization = 0.9, # Reduce if out of memory\n",
+ ")\n",
+ "\n",
+ "model = FastLanguageModel.get_peft_model(\n",
+ " model,\n",
+ " r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128\n",
+ " target_modules = [\n",
+ " \"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
+ " \"gate_proj\", \"up_proj\", \"down_proj\",\n",
+ " ],\n",
+ " lora_alpha = lora_rank*2, # *2 speeds up training\n",
+ " use_gradient_checkpointing = \"unsloth\", # Reduces memory usage\n",
+ " random_state = 3407,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ba056efa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# /home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_multiclinsum_test_en_full.json\n",
+ "with open('/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_multiclinsum_test_en_full.json', 'r') as f:\n",
+ " synthetic_data_with_gs_summary_en = json.load(f)\n",
+ "from datasets import Dataset\n",
+ "dataset = Dataset.from_list(synthetic_data_with_gs_summary_en)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fa285d3f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ad059247",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# /home/mshahidul/readctrl/code/RL_model/prompt\n",
+ "with open('/home/mshahidul/readctrl/code/RL_model/prompt', 'r') as f:\n",
+ " prompt_template = f.read()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f74cbfda",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset = dataset.map(lambda x: {\n",
+ " \"prompt\" : [\n",
+ " {\"role\": \"system\", \"content\": prompt_template},\n",
+ " {\"role\": \"user\", \"content\": f'''\n",
+ "- Input Language: English\n",
+ "- Gold Summary (the anchor reference summary): {x['summary']}\n",
+ "- Source Text (detailed content): {x['fulltext']}\n",
+ "'''},\n",
+ " ],\n",
+ " \"answer\": {\n",
+ " \"fulltext_subclaims\": x['fulltext_subclaims'],\n",
+ " \"summary_subclaims\": x['summary_subclaims'],\n",
+ " },\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0dd615f4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# /home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_en_20_67.json\n",
+ "import json\n",
+ "with open('/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_en_0_80_full.json', 'r') as f:\n",
+ " synthetic_data_diff_labels_en = json.load(f)\n",
+ "full_data=[]\n",
+ "# print((synthetic_data_diff_labels_en)[0].keys())\n",
+ "for item in synthetic_data_diff_labels_en:\n",
+ " texts=item['diff_label_texts']\n",
+ " for label in texts:\n",
+ " full_data.append({\n",
+ " \"index\": item['index'],\n",
+ " 'label': label,\n",
+ " \"original_text\": item['fulltext'],\n",
+ " \"generated_summary\": texts[label]\n",
+ " })\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3ba2a6cf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open('/home/mshahidul/readctrl/data/data_annotator_data/syn_data_diff_labels_en_0_80.json', 'w') as f:\n",
+ " json.dump(full_data, f, indent=4)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7cddc461",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# /home/mshahidul/readctrl/data/translated_data/translation_english2bangla_v1.json\n",
+ "import json\n",
+ "with open('/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_en.json', 'r', encoding='utf-8') as f:\n",
+ " dataset = json.load(f)\n",
+ "print(dataset[0].keys())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "2b3f2a96",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0_low_health_literacy\n",
+ "0_intermediate_health_literacy\n",
+ "0_proficient_health_literacy\n",
+ "1_low_health_literacy\n",
+ "1_intermediate_health_literacy\n",
+ "1_proficient_health_literacy\n",
+ "2_low_health_literacy\n",
+ "2_intermediate_health_literacy\n",
+ "2_proficient_health_literacy\n",
+ "3_low_health_literacy\n",
+ "3_intermediate_health_literacy\n",
+ "3_proficient_health_literacy\n",
+ "4_low_health_literacy\n",
+ "4_intermediate_health_literacy\n",
+ "4_proficient_health_literacy\n",
+ "5_low_health_literacy\n",
+ "5_intermediate_health_literacy\n",
+ "5_proficient_health_literacy\n",
+ "6_low_health_literacy\n",
+ "6_intermediate_health_literacy\n",
+ "6_proficient_health_literacy\n",
+ "7_low_health_literacy\n",
+ "7_intermediate_health_literacy\n",
+ "7_proficient_health_literacy\n",
+ "8_low_health_literacy\n",
+ "8_intermediate_health_literacy\n",
+ "8_proficient_health_literacy\n",
+ "9_low_health_literacy\n",
+ "9_intermediate_health_literacy\n",
+ "9_proficient_health_literacy\n",
+ "[... 210 lines truncated: keys continue in the same pattern through 79_proficient_health_literacy ...]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# /home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_en_0_80_full_updated.json\n",
+ "with open('/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_en_0_80_full_updated.json', 'r') as f:\n",
+ "    syn_data_diff_labels_en_0_80_full_updated = json.load(f)\n",
+ "\n",
+ "# Build a lookup keyed by '<doc index>_<literacy label>'.\n",
+ "map_data = {}\n",
+ "for item in syn_data_diff_labels_en_0_80_full_updated:\n",
+ "    for label in item['diff_label_texts']:\n",
+ "        key = f\"{item['index']}_{label}\"\n",
+ "        print(key)\n",
+ "        map_data[key] = {\n",
+ "            'doc_id': item['index'],\n",
+ "            'label': label,\n",
+ "            'fulltext': item['fulltext'],\n",
+ "            'diff_label_texts': item['diff_label_texts'][label],\n",
+ "            'summary': item['summary']\n",
+ "        }\n",
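+ "\n",
+ "# Illustrative shape of one entry (hypothetical doc index 12):\n",
+ "#   map_data['12_low_health_literacy'] -> {'doc_id': 12, 'label': 'low_health_literacy', ...}\n"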
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "c52e96ab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# /home/mshahidul/readctrl/data/annotators_validate_data_(20_80)/combine/consolidated_ratings_0-20(not_all_category).json\n",
+ "with open('/home/mshahidul/readctrl/data/annotators_validate_data_(20_80)/combine/consolidated_ratings_0-20(not_all_category).json', 'r') as f:\n",
+ "    consolidated_ratings_0_20 = json.load(f)\n",
+ "\n",
+ "# Re-attach the full record to each annotator-validated (doc_id, label) pair.\n",
+ "new_data = []\n",
+ "for item in consolidated_ratings_0_20:\n",
+ "    key = f\"{item['doc_id']}_{item['health_literacy_label']}\"\n",
+ "    new_data.append({**map_data[key]})\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "bfd6cf96",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open('/home/mshahidul/readctrl/data/annotators_validate_data_(20_80)/combine/verified_data_0-20.json', 'w') as f:\n",
+ " json.dump(new_data, f, indent=4)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cf797af6",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "un",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/code/RL_model/unsloth_rl/claim_verifier.py b/code/RL_model/unsloth_rl/claim_verifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a13d6268330c6e0171c46373e43136fc7363f43
--- /dev/null
+++ b/code/RL_model/unsloth_rl/claim_verifier.py
@@ -0,0 +1,175 @@
+import json
+import concurrent.futures
+from openai import OpenAI
+
+class MedicalClaimVerifier:
+ def __init__(self):
+ # OpenAI API configuration
+ api_file = "/home/mshahidul/api_new.json"
+ with open(api_file, "r") as f:
+ api_keys = json.load(f)
+ self.api_key = api_keys["openai"]
+ self.model_name = "gpt-5-mini"
+ self.client = OpenAI(api_key=self.api_key)
+
+ # Literacy ranges (IQR after outlier removal) from paper summary
+ # comp = completeness vs gold summary; cov = source_coverage vs full text
+ self.threshold_ranges = {
+ "low": {"comp": (0.9600, 1.0000), "cov": (0.1765, 0.3226)},
+ "intermediate": {"comp": (0.9393, 1.0000), "cov": (0.1818, 0.4091)},
+ "proficient": {"comp": (0.9231, 1.0000), "cov": (0.7725, 0.9347)},
+ }
+
+ # Minimum required information (upper bound of IQR)
+ self.thresholds = {
+ "low": {"comp": 1.0, "cov": 0.3226},
+ "intermediate": {"comp": 1.0, "cov": 0.4091},
+ "proficient": {"comp": 1.0, "cov": 0.9347},
+ }
+
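+    # Minimal sketch (hypothetical helper, not called by the reward logic below)
+    # of how one (comp, cov) pair is gated against the minimums above:
+    def passes_gates(self, level, comp, cov):
+        t = self.thresholds[level]
+        return comp >= t["comp"] and cov >= t["cov"]
+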
+    def get_prompt(self, context, claim):
+ prompt = f"""
+ CONTEXT:
+ {context}
+
+ CLAIM TO VERIFY:
+ {claim}
+
+ INSTRUCTION:
+ Does the CONTEXT above provide enough evidence to support the CLAIM?
+        - Answer 'supported' if the claim is explicitly stated in the context or can be logically inferred from it.
+        - Answer 'not_supported' if the claim is missing, contradicts the text, or requires outside information.
+
+ Output only one word: 'supported' or 'not_supported'.
+ """
+ return prompt
+
+ def check_support_api(self, prompt):
+ try:
+ response = self.client.chat.completions.create(
+ model=self.model_name,
+ messages=[{"role": "user", "content": prompt}],
+ )
+ res = response.choices[0].message.content.strip().lower()
+ # print("API Response:", res)
+ return 1.0 if "supported" in res and "not_supported" not in res else 0.0
+ except Exception as e:
+ print(f"API call error: {e}")
+ return 0.0
+
+ def evaluate_level(self, gen_text, gold_subs, full_subs, level_key):
+ """Calculates scores for a single literacy level."""
+ if not gen_text: return 0.0, 0.0
+
+ # Run API calls in parallel to save time during RL
+ try:
+ with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+ # Completeness check (vs Gold Summary Subclaims)
+ comp_prompts = [self.get_prompt(gen_text, s) for s in gold_subs]
+ comp_results = list(executor.map(self.check_support_api, comp_prompts))
+ comp_score = sum(comp_results) / len(comp_results) if comp_results else 0.0
+
+ # Coverage check (vs Full Text Subclaims)
+ cov_prompts = [self.get_prompt(gen_text, s) for s in full_subs]
+ cov_results = list(executor.map(self.check_support_api, cov_prompts))
+ cov_score = sum(cov_results) / len(cov_results) if cov_results else 0.0
+ # print(f"Comp Score: {comp_score}, Cov Score: {cov_score} for {level_key}")
+ except Exception as e:
+ print(f"Parallel API call error: {e}")
+ return 0.0, 0.0
+
+ return comp_score, cov_score
+
+ def get_reward_score(self, completion, gold_subs, full_subs):
+ data = None
+
+ # 1. Robust JSON Extraction
+ try:
+ # Clean potential markdown or whitespace
+ text = completion[0]['content'].strip().replace("```json", "").replace("```", "").strip()
+ data = json.loads(text)
+        except (json.JSONDecodeError, IndexError, ValueError) as e:
+            print(f"JSON Parsing Error in Reward Calculation: {e}")
+ # If all extraction attempts fail
+ return -5.0
+
+ # 2. Schema Validation
+ levels = ["low", "intermediate", "proficient"]
+ # Check if any required keys are missing
+ if not all(f"{lvl}_health_literacy" in data for lvl in levels):
+ return -2.0 # Slightly smaller penalty for partial formatting success
+
+ # 3. Scoring Logic
+ try:
+ total_reward = 0.0
+ pass_reward = 1.0
+ fail_penalty = -1.0
+ for lvl in levels:
+ gen_text = data.get(f"{lvl}_health_literacy", "")
+
+ # Skip scoring if text is empty
+ if not gen_text:
+ total_reward += fail_penalty
+ continue
+
+ comp_score, cov_score = self.evaluate_level(gen_text, gold_subs, full_subs, lvl)
+
+ # Apply Thresholds
+ total_reward += pass_reward if comp_score >= self.thresholds[lvl]["comp"] else fail_penalty
+ total_reward += pass_reward if cov_score >= self.thresholds[lvl]["cov"] else fail_penalty
+
+ return total_reward
+ except Exception:
+ return -5.0
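+
+# Score range: with pass_reward = 1.0 and fail_penalty = -1.0 over two checks
+# per level and three levels, a fully populated, parseable output lands in
+# [-6.0, +6.0]; an empty level instead contributes a single -1.0.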
+
+
+# 1. Ground Truth Subclaims (Extracted from a medical paper on Hypertension)
+gold_summary_subclaims = [
+ "Hypertension is defined as blood pressure above 140/90 mmHg.",
+ "Lifestyle changes like low salt intake can reduce blood pressure.",
+ "Diuretics are often the first line of pharmacological treatment."
+]
+
+full_text_subclaims = [
+ "Hypertension is defined as blood pressure above 140/90 mmHg.",
+ "Lifestyle changes like low salt intake can reduce blood pressure.",
+ "Diuretics are often the first line of pharmacological treatment.",
+ "The DASH diet emphasizes fruits, vegetables, and low-fat dairy.",
+ "Chronic hypertension increases the risk of stroke and myocardial infarction.",
+ "ACE inhibitors are contraindicated during pregnancy.",
+ "Secondary hypertension can be caused by renal artery stenosis."
+]
+
+# 2. Mock Model Completion (The output being evaluated)
+# This mimics the format your RL environment would pass to the reward function
+mock_completion = [{
+ 'content': """
+ {
+ "low_health_literacy": "High blood pressure is when your blood is too strong for your veins. You should eat less salt to help stay healthy.",
+ "intermediate_health_literacy": "Hypertension is blood pressure over 140/90. You can lower it by eating less salt and taking water pills (diuretics) if your doctor says so.",
+ "proficient_health_literacy": "Hypertension (BP > 140/90 mmHg) is managed via lifestyle modifications like the DASH diet and salt restriction. Pharmacological interventions include diuretics as first-line therapy, though risks like stroke or heart attack persist if untreated. Secondary causes like renal artery stenosis should be screened, and ACE inhibitors must be avoided in pregnancy."
+ }
+ """
+}]
+
+# Initialize your verifier
+verifier = MedicalClaimVerifier()
+
+# Test the reward calculation
+reward = verifier.get_reward_score(
+ completion=mock_completion,
+ gold_subs=gold_summary_subclaims,
+ full_subs=full_text_subclaims
+)
+
+print("--- Evaluation Result ---")
+print(f"Total Reward Score: {reward}")
+
+# Logic Explanation:
+# - Low: Likely fails 'comp' (missing 140/90 info), but might pass 'cov' (low threshold).
+# - Intermediate: Likely passes 'comp' and 'cov'.
+# - Proficient: Needs to cover almost all 7 subclaims to pass the 0.77 coverage threshold.
\ No newline at end of file
diff --git a/code/RL_model/unsloth_rl/finetune.py b/code/RL_model/unsloth_rl/finetune.py
new file mode 100644
index 0000000000000000000000000000000000000000..c454b9ba95b04576f9bc5bf67ef3310e68a91a81
--- /dev/null
+++ b/code/RL_model/unsloth_rl/finetune.py
@@ -0,0 +1,91 @@
+import os
+# Set GPU environment variables
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "2"
+import torch
+from unsloth import FastLanguageModel
+from datasets import load_dataset
+from trl import SFTTrainer, SFTConfig
+from unsloth.chat_templates import get_chat_template, standardize_data_formats, train_on_responses_only
+
+# 1. Configuration
+model_name = "unsloth/Qwen3-4B-Instruct-2507"
+max_seq_length = 8192
+dataset_path = "/home/mshahidul/readctrl/data/finetuning_data/training_data_readability_data_generation.json"
+output_dir = "/home/mshahidul/readctrl_model/RL_model/readability_sft_lora_model"
+
+# 2. Load Model and Tokenizer
+model, tokenizer = FastLanguageModel.from_pretrained(
+ model_name = model_name,
+ max_seq_length = max_seq_length,
+ load_in_4bit = True,
+)
+
+# 3. Add LoRA Adapters
+model = FastLanguageModel.get_peft_model(
+ model,
+ r = 32,
+ target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+ "gate_proj", "up_proj", "down_proj",],
+ lora_alpha = 32,
+ lora_dropout = 0,
+ bias = "none",
+ use_gradient_checkpointing = "unsloth",
+ random_state = 3407,
+)
+
+# 4. Data Preparation
+tokenizer = get_chat_template(
+ tokenizer,
+ chat_template = "qwen3-instruct",
+)
+
+dataset = load_dataset("json", data_files = dataset_path, split = "train")
+dataset = standardize_data_formats(dataset)
+
+def formatting_prompts_func(examples):
+ convos = examples["conversations"]
+ texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
+ return { "text" : texts, }
+
+dataset = dataset.map(formatting_prompts_func, batched = True)
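+
+# Assumed record layout after standardize_data_formats (a sketch; the JSON on
+# disk is expected to look like this, but it is not shown here):
+#   {"conversations": [{"role": "user", "content": "..."},
+#                      {"role": "assistant", "content": "..."}]}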
+
+# 5. Training Setup
+trainer = SFTTrainer(
+ model = model,
+ tokenizer = tokenizer,
+ train_dataset = dataset,
+ dataset_text_field = "text",
+ max_seq_length = max_seq_length,
+ args = SFTConfig(
+ per_device_train_batch_size = 2,
+ gradient_accumulation_steps = 4,
+ warmup_steps = 5,
+ # max_steps = 60, # Adjust as needed for your dataset size
+ num_train_epochs = 3,
+ learning_rate = 2e-4,
+ fp16 = not torch.cuda.is_bf16_supported(),
+ bf16 = torch.cuda.is_bf16_supported(),
+ logging_steps = 1,
+ optim = "adamw_8bit",
+ weight_decay = 0.01,
+ lr_scheduler_type = "linear",
+ seed = 3407,
+ output_dir = "outputs",
+ ),
+)
+
+# Train only on assistant responses
+trainer = train_on_responses_only(
+ trainer,
+ instruction_part = "<|im_start|>user\n",
+ response_part = "<|im_start|>assistant\n",
+)
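+
+# Note: train_on_responses_only masks the instruction segments out of the
+# labels, so the loss (and gradients) come only from assistant-response tokens.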
+
+# 6. Train and Save
+trainer.train()
+
+model.save_pretrained(output_dir)
+tokenizer.save_pretrained(output_dir)
+
+print(f"Model saved to {output_dir}")
\ No newline at end of file
diff --git a/code/RL_model/unsloth_rl/health_classifier.py b/code/RL_model/unsloth_rl/health_classifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..1de86d751fea3ffdc5952ea866113e54d6374471
--- /dev/null
+++ b/code/RL_model/unsloth_rl/health_classifier.py
@@ -0,0 +1,42 @@
+import dspy
+import json
+from typing import Literal
+
+# --- 1. LLM Configuration ---
+def setup_dspy_classifier(save_path, api_key_path):
+ with open(api_key_path, "r") as f:
+ api_keys = json.load(f)
+
+ # Configure the LM
+ # Note: 'gpt-5-mini' is used per your configuration; ensure this matches your provider
+ openai_model = dspy.LM(model='gpt-5-mini', api_key=api_keys["openai"])
+ dspy.configure(lm=openai_model)
+
+ class HealthLiteracySignature(dspy.Signature):
+ """
+ Judge the health literacy level of a generated medical summary.
+ Identify if the language is suitable for a layperson (low) or requires medical expertise (proficient).
+ """
+ summary_text: str = dspy.InputField(desc="The generated medical summary to be analyzed.")
+ reasoning: str = dspy.OutputField(desc="Analysis of jargon, acronyms, and sentence complexity.")
+ label: Literal["low_health_literacy", "intermediate_health_literacy", "proficient_health_literacy"] = dspy.OutputField()
+
+ class HealthLiteracyClassifier(dspy.Module):
+ def __init__(self):
+ super().__init__()
+ self.predictor = dspy.ChainOfThought(HealthLiteracySignature)
+
+ def forward(self, summary_text):
+ return self.predictor(summary_text=summary_text)
+
+ # Initialize and load weights
+ classifier_instance = HealthLiteracyClassifier()
+ classifier_instance.load(save_path)
+ return classifier_instance
+
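+# Usage sketch for the instance created below (assuming the optimized program at
+# SAVE_PATH was produced earlier by classifier_instance.save(...)):
+#   pred = classifier(summary_text="Take one pill twice a day with food.")
+#   print(pred.label, "|", pred.reasoning)
+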
+# Global instantiation (optional, or you can call setup in your main script)
+API_FILE = "/home/mshahidul/api_new.json"
+SAVE_PATH = "/home/mshahidul/readctrl/data/new_exp/optimized_health_classifier_gpt5-mini_v2.json"
+
+# Create the instance to be imported
+classifier = setup_dspy_classifier(SAVE_PATH, API_FILE)
\ No newline at end of file
diff --git a/code/RL_model/unsloth_rl/highlighter.py b/code/RL_model/unsloth_rl/highlighter.py
new file mode 100644
index 0000000000000000000000000000000000000000..febe5ab7e544448088c14affc54b4e9ffff632ef
--- /dev/null
+++ b/code/RL_model/unsloth_rl/highlighter.py
@@ -0,0 +1,103 @@
+import gradio as gr
+from transformers import AutoModel
+import torch
+
+# 1. Load the model (ensure you have transformers and torch installed)
+print("Loading model... This may take a moment.")
+model = AutoModel.from_pretrained(
+ "zilliz/semantic-highlight-bilingual-v1",
+ trust_remote_code=True
+)
+
+def process_and_highlight(question, context, threshold):
+ if not question or not context:
+ return "Please provide both a question and context."
+
+ # 2. Run the model inference
+ result = model.process(
+ question=question,
+ context=context,
+ threshold=threshold,
+ return_sentence_metrics=True
+ )
+
+ highlighted_sentences = result.get("highlighted_sentences", [])
+
+ # 3. Create the highlighted HTML output
+ # We iterate through the context and wrap highlighted sentences in HTML tags
+ output_html = context
+
+ # Sort highlighted sentences by length (descending) to avoid partial
+ # matching issues if one sentence is a substring of another
+ highlighted_sentences.sort(key=len, reverse=True)
+
+ for sent in highlighted_sentences:
+ # Use a bright yellow highlight style
+ style = "background-color: #fff176; color: #000; padding: 2px; border-radius: 3px; font-weight: 500;"
+        highlighted_tag = f'<span style="{style}">{sent}</span>'
+ output_html = output_html.replace(sent, highlighted_tag)
+
+ # Wrap in a container for better typography
+    final_output = f"""
+    <div style="font-family: sans-serif; font-size: 16px; line-height: 1.7;">
+    {output_html}
+    </div>
+    """
+
+ # 4. Format metrics for the display
+ metrics_str = "No specific probabilities returned."
+ if "sentence_probabilities" in result:
+ metrics_str = "\n".join([f"• {p:.4f}" for p in result["sentence_probabilities"]])
+
+ return final_output, metrics_str
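+
+# Note: str.replace substitutes every occurrence, so a sentence repeated
+# verbatim in the context gets highlighted at each position where it appears.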
+
+# 5. Build the Gradio UI
+with gr.Blocks(theme=gr.themes.Soft(), title="Semantic Highlighter") as demo:
+ gr.Markdown("# 🔍 Semantic Highlight Explorer")
+ gr.Markdown("Identify and highlight parts of a text that answer a specific question using the Zilliz bilingual model.")
+
+ with gr.Row():
+ with gr.Column(scale=1):
+ question_input = gr.Textbox(
+ label="Question",
+ placeholder="e.g., What are the symptoms of dehydration?",
+ lines=2
+ )
+ context_input = gr.Textbox(
+ label="Context / Full Text",
+ placeholder="Paste the document text here...",
+ lines=10
+ )
+ threshold_slider = gr.Slider(
+ minimum=0.1, maximum=1.0, value=0.5, step=0.05,
+ label="Confidence Threshold"
+ )
+ submit_btn = gr.Button("Analyze & Highlight", variant="primary")
+
+ with gr.Column(scale=1):
+ gr.Label("Highlighted Result")
+ output_display = gr.HTML()
+
+ with gr.Accordion("Sentence Metrics", open=False):
+ metrics_display = gr.Textbox(label="Probabilities", lines=5)
+
+ # Add example from your snippet
+ gr.Examples(
+ examples=[
+ [
+ "What are the symptoms of dehydration?",
+ "Dehydration occurs when your body loses more fluid than you take in. Common signs include feeling thirsty and having a dry mouth. The human body is composed of about 60% water. Dark yellow urine and infrequent urination are warning signs. Water is essential for many bodily functions. Dizziness, fatigue, and headaches can indicate severe dehydration.",
+ 0.5
+ ]
+ ],
+ inputs=[question_input, context_input, threshold_slider]
+ )
+
+ submit_btn.click(
+ fn=process_and_highlight,
+ inputs=[question_input, context_input, threshold_slider],
+ outputs=[output_display, metrics_display]
+ )
+
+if __name__ == "__main__":
+ demo.launch(share=True)
\ No newline at end of file
diff --git a/code/RL_model/unsloth_rl/inference.py b/code/RL_model/unsloth_rl/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..9171119c7a76f35dea31985559afb6a9ca0d7f20
--- /dev/null
+++ b/code/RL_model/unsloth_rl/inference.py
@@ -0,0 +1,120 @@
+import json
+import os
+# Set GPU environment variables
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "2"
+import torch
+from unsloth import FastLanguageModel
+from transformers import TextStreamer
+
+# 1. Configuration
+model_path = "/home/mshahidul/readctrl_model/RL_model/readability_sft_lora_model"
+max_seq_length = 8192
+
+# 2. Load the Fine-tuned Model and Tokenizer
+# Unsloth automatically reloads the base Qwen3 model and attaches your adapters.
+model, tokenizer = FastLanguageModel.from_pretrained(
+ model_name = model_path,
+ max_seq_length = max_seq_length,
+ load_in_4bit = False,
+)
+
+# 3. Enable Fast Inference
+# This activates Unsloth's optimized inference kernels for a 2x speedup.
+FastLanguageModel.for_inference(model)
+
+# 4. Prepare your Test Data
+# Replace these with actual values from your evaluation set
+gold_summary = "A 34-year-old pregnant woman presents with seizures and dysarthria and is urgently referred for a cranial MRI. The classic ‘Medusa head’ sign is seen and the diagnosis is made as a venous anomaly of development with peripheral partial thrombosis and proximal slow flow.\n"
+fulltext = "We present the case of a 34-year-old woman, eight weeks pregnant with no other personal history of interest, who presents to the emergency department with generalized convulsions with dysarthria in the postcritical period, which resolve progressively in less than two hours. On physical examination, she is conscious, oriented, with no language or motor or sensory deficits. Only signs of a right lateral tongue bite are observed.\n\nThe complementary tests, such as blood tests or the electrocardiogram, are normal. Given that the episode corresponds with a first epileptic seizure and the patient is pregnant, an urgent magnetic resonance of the skull is requested.\n\nThe usual protocol was performed and 3D T1 sequences without and with intravenous contrast were obtained in axial, coronal and sagital planes, axial FLAIR, axial T2, VEN BOLD and magnetic susceptibility sequences, as well as axial diffusion and apparent diffusion coefficient map. The MRI identified multiple venous cortico-medullary vascular structures converging centripetally to a large central venous structure draining through the inferior anastomotic vein into the left transverse sinus, forming the classic ‘Medusa head’ sign. In the T1 sequences, the drainage vein was seen to be increased in signal with central hyphocaptation after contrast administration, suggesting partial thrombosis versus slow flow. In addition, in T2 and FLAIR sequences, the brain tissue surrounding the drainage vein was seen to be hyperintense, without diffusion restriction and compatible with edema.\n\nThese findings are suggestive of a venous anomaly of development with signs of partial peripheral thrombosis and slow flow more proximal, which cause edema of the surrounding tissue. She is started on clexane 60 mg/12 hours and levetiracetam 500 mg/12 hours and the patient shows improvement and symptomatic stability after one week.\n"
+
+
+# Define your exact system prompt
+system_prompt = f"""
+ **System Role:**
+
+ You are an expert medical editor and Health Literacy specialist. Your task is to transform complex medical text into three distinct versions based on the reader's health literacy level. You must maintain the source language of the input while adjusting the linguistic complexity. Use the provided Gold Summary as the factual anchor to ensure the simplified versions remain accurate and focused on the most important information.
+
+ **User Prompt:**
+
+ Please process the following medical Source Text and its corresponding Gold Summary to generate three versions tailored to different health literacy levels.
+ ### Instructions for Each Level:
+
+ 1. Level: Low Health Literacy (High Readability)
+
+ Target: Individuals needing the simplest terms for immediate action.
+
+ Linguistic Goal: Use "living room" language. Replace all medical jargon with functional descriptions (e.g., "renal" becomes "kidney").
+
+ Information Density: Focus strictly on the "need-to-know" info found in the Gold Summary.
+
+ Strategy: High paraphrasing using analogies. One idea per sentence.
+
+ Faithfulness: Must align perfectly with the Gold Summary.
+
+ 2. Level: Intermediate Health Literacy (Medium Readability)
+
+ Target: The general public (news-reading level).
+
+ Linguistic Goal: Standard vocabulary. Common medical terms are okay, but technical "doctor-speak" must be simplified.
+
+ Information Density: Balanced. Use the Gold Summary as the lead, supplemented by necessary context from the Source Text.
+
+ Strategy: Moderate paraphrasing. Remove minor technical details to avoid information overload.
+
+ Faithfulness: Maintains the main narrative of the Gold Summary.
+
+ 3. Level: Proficient Health Literacy (Low Readability)
+
+ Target: Researchers, clinicians, or highly informed patients.
+
+ Linguistic Goal: Technical and academic language. Prioritize clinical nuance and medical accuracy.
+
+ Information Density: High. Use the Full Source Text to include data, physiological mechanisms, and statistics.
+
+ Strategy: Minimal paraphrasing. Retain all original technical terminology.
+
+ Faithfulness: Adhere to the Source Text; you may add related subclaims that provide deeper scientific context.
+
+ Input Language: English
+ Gold Summary (The Anchor):
+ {gold_summary}
+ Source Text (The Detail):
+ {fulltext}
+
+ **Output Format (JSON only):**
+ {{
+ "low_health_literacy": "...",
+ "intermediate_health_literacy": "...",
+ "proficient_health_literacy": "..."
+ }}
+"""
+
+# Format for Qwen-3 instruction template
+messages = [
+ {"role": "user", "content": system_prompt}
+]
+
+input_text = tokenizer.apply_chat_template(
+ messages,
+ tokenize = False,
+ add_generation_prompt = True,
+)
+
+inputs = tokenizer([input_text], return_tensors = "pt").to("cuda")
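+
+# To capture the full text instead of streaming it, one could drop the streamer
+# and decode only the newly generated tokens (a sketch):
+#   out = model.generate(**inputs, max_new_tokens = 2048)
+#   print(tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens = True))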
+
+# 5. Run Generation
+# Using recommended sampling parameters for Qwen3 non-thinking mode.
+text_streamer = TextStreamer(tokenizer, skip_prompt = True, skip_special_tokens = True)
+
+print("--- Model Response ---")
+_ = model.generate(
+ **inputs,
+ streamer = text_streamer,
+ max_new_tokens = 2048,
+ temperature = 0.7,
+ top_p = 0.8,
+ top_k = 20,
+ repetition_penalty = 1.05,
+ use_cache = True,
+)
\ No newline at end of file
diff --git a/code/RL_model/unsloth_rl/prompt b/code/RL_model/unsloth_rl/prompt
new file mode 100644
index 0000000000000000000000000000000000000000..084bb706dafafee7913a406ccb6fbffa524be840
--- /dev/null
+++ b/code/RL_model/unsloth_rl/prompt
@@ -0,0 +1,58 @@
+**System Role:**
+
+You are an expert medical editor and Health Literacy specialist. Your task is to transform complex medical text into three distinct versions based on the reader's health literacy level. You must maintain the source language of the input while adjusting the linguistic complexity. Use the provided Gold Summary as the factual anchor to ensure the simplified versions remain accurate and focused on the most important information.
+
+**User Prompt:**
+
+Please process the following medical Source Text and its corresponding Gold Summary to generate three versions tailored to different health literacy levels.
+### Instructions for Each Level:
+
+1. Level: Low Health Literacy (High Readability)
+
+Target: Individuals needing the simplest terms for immediate action.
+
+Linguistic Goal: Use "living room" language. Replace all medical jargon with functional descriptions (e.g., "renal" becomes "kidney").
+
+Information Density: Focus strictly on the "need-to-know" info found in the Gold Summary.
+
+Strategy: High paraphrasing using analogies. One idea per sentence.
+
+Faithfulness: Must align perfectly with the Gold Summary.
+
+2. Level: Intermediate Health Literacy (Medium Readability)
+
+Target: The general public (news-reading level).
+
+Linguistic Goal: Standard vocabulary. Common medical terms are okay, but technical "doctor-speak" must be simplified.
+
+Information Density: Balanced. Use the Gold Summary as the lead, supplemented by necessary context from the Source Text.
+
+Strategy: Moderate paraphrasing. Remove minor technical details to avoid information overload.
+
+Faithfulness: Maintains the main narrative of the Gold Summary.
+
+3. Level: Proficient Health Literacy (Low Readability)
+
+Target: Researchers, clinicians, or highly informed patients.
+
+Linguistic Goal: Technical and academic language. Prioritize clinical nuance and medical accuracy.
+
+Information Density: High. Use the Full Source Text to include data, physiological mechanisms, and statistics.
+
+Strategy: Minimal paraphrasing. Retain all original technical terminology.
+
+Faithfulness: Adhere to the Source Text; you may add related subclaims that provide deeper scientific context.
+
+
+I will provide the following information:
+
+- Input Language: <<>>
+- Gold Summary (the anchor reference summary): <<>>
+- Source Text (detailed content): <<>>
+
+**Output Format (JSON only):**
+ {{
+ "low_health_literacy": "...",
+ "intermediate_health_literacy": "...",
+ "proficient_health_literacy": "..."
+ }}
\ No newline at end of file
diff --git a/code/RL_model/unsloth_rl/reward_mock.py b/code/RL_model/unsloth_rl/reward_mock.py
new file mode 100644
index 0000000000000000000000000000000000000000..370f2b8fe221e36e3881aa42648c0958564698e9
--- /dev/null
+++ b/code/RL_model/unsloth_rl/reward_mock.py
@@ -0,0 +1,127 @@
+import os
+import json
+import concurrent.futures
+from openai import OpenAI
+
+class MedicalClaimVerifier:
+ def __init__(self):
+ # Implementation remains similar, but with safer error handling
+ api_file = "/home/mshahidul/api_new.json"
+ with open(api_file, "r") as f:
+ api_keys = json.load(f)
+ self.api_key = api_keys["openai"]
+ # Note: Ensure gpt-5-nano is actually available in your tier
+ self.model_name = "gpt-5-nano"
+ self.client = OpenAI(api_key=self.api_key)
+
+ self.thresholds = {
+ "low": {"comp": 1.0, "cov": 0.3226},
+ "intermediate": {"comp": 1.0, "cov": 0.4091},
+ "proficient": {"comp": 1.0, "cov": 0.9347},
+ }
+
+    def get_prompt(self, context, claim):
+ prompt = f"""
+ CONTEXT:
+ {context}
+
+ CLAIM TO VERIFY:
+ {claim}
+
+ INSTRUCTION:
+ Does the CONTEXT above provide enough evidence to support the CLAIM?
+        - Answer 'supported' if the claim is explicitly stated in the context or can be logically inferred from it.
+        - Answer 'not_supported' if the claim is missing, contradicts the text, or requires outside information.
+
+ Output only one word: 'supported' or 'not_supported'.
+ """
+ return prompt
+
+ def check_support_api(self, prompt):
+ try:
+ response = self.client.chat.completions.create(
+ model=self.model_name,
+ messages=[{"role": "user", "content": prompt}],
+ )
+ res = response.choices[0].message.content.strip().lower()
+ return 1.0 if "supported" in res and "not_supported" not in res else 0.0
+ except Exception:
+ return 0.0
+
+ def evaluate_level(self, gen_text, gold_subs, full_subs):
+ if not gen_text or not gold_subs or not full_subs:
+ return 0.0, 0.0
+
+ # Combining calls to reduce overhead
+ all_claims = gold_subs + full_subs
+ with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+ results = list(executor.map(self.check_support_api, [self.get_prompt(gen_text, s) for s in all_claims]))
+
+ comp_results = results[:len(gold_subs)]
+ cov_results = results[len(gold_subs):]
+
+ comp_score = sum(comp_results) / len(gold_subs)
+ cov_score = sum(cov_results) / len(full_subs)
+ return comp_score, cov_score
+
+verifier = MedicalClaimVerifier()
+
+def compute_score(data_source, solution_str, ground_truth, extra_info=None):
+ gold_subs = ground_truth.get('summary_subclaims', [])
+ full_subs = ground_truth.get('fulltext_subclaims', [])
+
+ if not gold_subs or not full_subs:
+ return 0.0
+
+ # 1. Parsing with fallback
+ try:
+ cleaned_str = solution_str.strip()
+ if "```json" in cleaned_str:
+ cleaned_str = cleaned_str.split("```json")[1].split("```")[0].strip()
+ elif "```" in cleaned_str:
+ cleaned_str = cleaned_str.split("```")[1].split("```")[0].strip()
+ data = json.loads(cleaned_str)
+ except Exception:
+ return -5.0
+
+ levels = ["low", "intermediate", "proficient"]
+ scores = {}
+
+ # 2. Score Calculation
+ for lvl in levels:
+ gen_text = data.get(f"{lvl}_health_literacy", "")
+ if not gen_text:
+ scores[lvl] = {"comp": 0.0, "cov": 0.0, "missing": True}
+ else:
+ comp, cov = verifier.evaluate_level(gen_text, gold_subs, full_subs)
+ scores[lvl] = {"comp": comp, "cov": cov, "missing": False}
+
+ # 3. Reward Shaping Logic
+ total_reward = 0.0
+
+ low_cov = scores["low"]["cov"]
+ int_cov = scores["intermediate"]["cov"]
+ pro_cov = scores["proficient"]["cov"]
+
+ # Soft Hierarchy Check: Reward progression, penalize stagnation
+ # Instead of -2.0 exit, we subtract if the order is wrong
+ hierarchy_penalty = 0.0
+ if not (low_cov <= int_cov <= pro_cov):
+ hierarchy_penalty = -2.0
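+    # Worked example (assumed scores): cov = (0.30, 0.40, 0.95) is monotone, so
+    # no penalty; cov = (0.50, 0.40, 0.95) breaks low <= intermediate and costs
+    # a flat -2.0 on top of the per-level deltas computed below.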
+
+ for lvl in levels:
+ if scores[lvl]["missing"]:
+ total_reward -= 1.0 # Penalty per missing field
+ continue
+
+ comp_s = scores[lvl]["comp"]
+ cov_s = scores[lvl]["cov"]
+ thresh = verifier.thresholds[lvl]
+
+ # Continuous Reward: (Actual - Threshold)
+ # This tells the model "You're 10% away" vs "You failed"
+ total_reward += (comp_s - thresh["comp"])
+ total_reward += (cov_s - thresh["cov"])
+
+ return total_reward + hierarchy_penalty
\ No newline at end of file
diff --git a/code/RL_model/unsloth_rl/test_reward_mock_unittest.py b/code/RL_model/unsloth_rl/test_reward_mock_unittest.py
new file mode 100644
index 0000000000000000000000000000000000000000..45346d9c2d83ca7fa56c2d80795a3b89f1970e98
--- /dev/null
+++ b/code/RL_model/unsloth_rl/test_reward_mock_unittest.py
@@ -0,0 +1,139 @@
+"""Minimal, offline tests for reward_mock.py.
+
+Run:
+ python code/RL_model/unsloth_rl/test_reward_mock_unittest.py
+
+These tests avoid real OpenAI calls by:
+- mocking the API key file read
+- stubbing OpenAI client construction
+- overriding verifier.evaluate_level to deterministic outputs
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import sys
+import types
+import unittest
+from pathlib import Path
+from unittest.mock import mock_open, patch
+
+
+THIS_DIR = Path(__file__).resolve().parent
+REWARD_MOCK_PATH = THIS_DIR / "reward_mock.py"
+
+
+class FakeOpenAI:
+ def __init__(self, api_key: str | None = None, **_kwargs):
+ self.api_key = api_key
+
+
+def load_reward_mock_module():
+ """Load reward_mock.py from its file path under test-friendly patches."""
+ module_name = "reward_mock_under_test"
+ if module_name in sys.modules:
+ del sys.modules[module_name]
+
+ spec = importlib.util.spec_from_file_location(module_name, str(REWARD_MOCK_PATH))
+ if spec is None or spec.loader is None:
+ raise RuntimeError(f"Failed to create import spec for {REWARD_MOCK_PATH}")
+
+ module = importlib.util.module_from_spec(spec)
+
+ # Ensure 'openai' import is available and OpenAI ctor is patched.
+ # reward_mock does: `from openai import OpenAI`
+ with patch("builtins.open", mock_open(read_data='{"openai": "sk-test"}')):
+ with patch("openai.OpenAI", FakeOpenAI):
+ spec.loader.exec_module(module)
+
+ sys.modules[module_name] = module
+ return module
+
+
+class TestRewardMockComputeScore(unittest.TestCase):
+ def test_valid_json_progression_no_hierarchy_penalty(self):
+ rm = load_reward_mock_module()
+
+ def fake_evaluate_level(gen_text, gold_subs, full_subs):
+ # Return (comp, cov) deterministically based on the generated text.
+ if gen_text == "LOW":
+ return 1.0, 0.3000
+ if gen_text == "INTER":
+ return 1.0, 0.4000
+ if gen_text == "PRO":
+ return 1.0, 0.9500
+ return 0.0, 0.0
+
+ rm.verifier.evaluate_level = fake_evaluate_level
+
+ solution_str = """```json
+ {
+ "low_health_literacy": "LOW",
+ "intermediate_health_literacy": "INTER",
+ "proficient_health_literacy": "PRO"
+ }
+ ```"""
+
+ ground_truth = {
+ "summary_subclaims": ["a", "b"],
+ "fulltext_subclaims": ["x", "y", "z"],
+ }
+
+ score = rm.compute_score(data_source=None, solution_str=solution_str, ground_truth=ground_truth)
+
+ # comp thresholds are 1.0 -> comp deltas = 0
+ # cov deltas: (0.3000-0.3226) + (0.4000-0.4091) + (0.9500-0.9347) = -0.0164
+ self.assertAlmostEqual(score, -0.0164, places=4)
+
+ def test_missing_field_penalizes_and_triggers_hierarchy_penalty(self):
+ rm = load_reward_mock_module()
+
+ def fake_evaluate_level(gen_text, gold_subs, full_subs):
+ if gen_text == "LOW":
+ return 1.0, 0.3000
+ if gen_text == "PRO":
+ return 1.0, 0.9500
+ return 0.0, 0.0
+
+ rm.verifier.evaluate_level = fake_evaluate_level
+
+ # intermediate is missing => -1.0
+ # BUT its cov will be 0.0 for the hierarchy check, so low_cov(0.3) <= int_cov(0.0) fails => -2.0
+ solution_str = '{"low_health_literacy": "LOW", "proficient_health_literacy": "PRO"}'
+
+ ground_truth = {
+ "summary_subclaims": ["a"],
+ "fulltext_subclaims": ["x"],
+ }
+
+ score = rm.compute_score(data_source=None, solution_str=solution_str, ground_truth=ground_truth)
+ expected = (0.3000 - 0.3226) + (0.9500 - 0.9347) - 1.0 - 2.0
+ self.assertAlmostEqual(score, expected, places=4)
+
+ def test_invalid_json_returns_minus_five(self):
+ rm = load_reward_mock_module()
+
+ ground_truth = {
+ "summary_subclaims": ["a"],
+ "fulltext_subclaims": ["x"],
+ }
+
+ score = rm.compute_score(data_source=None, solution_str="not a json", ground_truth=ground_truth)
+ self.assertEqual(score, -5.0)
+
+ def test_missing_claims_returns_zero(self):
+ rm = load_reward_mock_module()
+
+ solution_str = '{"low_health_literacy": "LOW", "intermediate_health_literacy": "INTER", "proficient_health_literacy": "PRO"}'
+
+ # Missing subclaims => early return 0.0
+ score = rm.compute_score(
+ data_source=None,
+ solution_str=solution_str,
+ ground_truth={"summary_subclaims": [], "fulltext_subclaims": ["x"]},
+ )
+ self.assertEqual(score, 0.0)
+
+
+if __name__ == "__main__":
+ unittest.main(verbosity=2)
diff --git a/code/RL_model/unsloth_rl/testing.py b/code/RL_model/unsloth_rl/testing.py
new file mode 100644
index 0000000000000000000000000000000000000000..a08bd2df41037ae156269e182a474ce0a60ad4d8
--- /dev/null
+++ b/code/RL_model/unsloth_rl/testing.py
@@ -0,0 +1,215 @@
+import json
+import concurrent.futures
+
+# --- The Class (Modified slightly for standalone demo) ---
+
+class MedicalClaimVerifier:
+ def __init__(self, mock_mode=False):
+ self.thresholds = {
+ "low": {"comp": 0.6107, "cov": 0.3723},
+ "intermediate": {"comp": 0.8199, "cov": 0.6611},
+ "proficient": {"comp": 0.9569, "cov": 0.9069}
+ }
+ self.mock_mode = mock_mode
+
+ if not mock_mode:
+ from openai import OpenAI
+ self.api_url = "http://172.16.34.29:8004/v1"
+ self.client = OpenAI(base_url=self.api_url, api_key="EMPTY")
+ self.model_name = "qwen3-32b-readctrl"
+
+ def get_audit_prompt(self, literacy_level):
+ level_guidelines = {
+ "low_health_literacy": """
+ Level: Low Health Literacy (High Readability)
+ Target: Individuals needing simple terms.
+ Goal: 'Living room' language. Replace jargon (e.g., 'renal' -> 'kidney').
+ Density: Strictly 'need-to-know' info from Gold Summary.
+ Strategy: High paraphrasing, analogies, one idea per sentence.
+ Faithfulness: Must align with Gold Summary.""",
+
+ "intermediate_health_literacy": """
+ Level: Intermediate Health Literacy (Medium Readability)
+ Target: General public.
+ Goal: Standard vocabulary. Common medical terms okay; technical speak simplified.
+ Density: Balanced. Use Gold Summary as lead, supplemented by context from Source.
+ Strategy: Moderate paraphrasing. Remove minor technical details.
+ Faithfulness: Maintain main narrative of Gold Summary.""",
+
+ "proficient_health_literacy": """
+ Level: Proficient Health Literacy (Low Readability)
+ Target: Researchers/Clinicians.
+ Goal: Technical/Academic. Prioritize clinical nuance and accuracy.
+ Density: High. Include data, physiological mechanisms, and statistics from Source.
+ Strategy: Minimal paraphrasing. Retain original technical terminology.
+ Faithfulness: Adhere to Source Text; add deeper scientific context."""
+ }
+
+ guidelines = level_guidelines.get(literacy_level, "Follow standard medical audit practices.")
+
+ base_instructions = f"""
+ ### Literacy Level Context:
+ {guidelines}
+
+ ### Task Instructions:"""
+ return base_instructions
+
+ def get_completeness_prompt(self, generated_text, source_subclaim, literacy_level):
+ base_instructions = self.get_audit_prompt(literacy_level)
+ level_desc = literacy_level.replace("_", " ")
+ return f"""{base_instructions}
+ 1. Determine whether this Fact from the Gold Standard is covered in the {level_desc} summary.
+ 2. Mark 'supported' ONLY IF:
+ - The fact is explicitly stated in the summary, OR
+ - The fact is clearly paraphrased or simplified in a way that preserves its meaning.
+ 3. Do NOT mark 'supported' based solely on omission.
+ - Absence of mention does NOT imply intentional exclusion.
+ - Negative or exclusionary facts (e.g., "no complications," "no family history," "no systemic signs") must be explicitly conveyed.
+ 4. Mark 'not_supported' if:
+ - The fact is completely omitted, OR
+ - The summary discusses related information but does not confirm the specific fact.
+ 5. Literacy-based simplification is allowed, but factual meaning must be preserved.
+
+ SUMMARY: {generated_text}
+ FACT: {source_subclaim}
+
+ output: 'supported' or 'not_supported'.
+ """
+
+ def get_source_coverage_prompt(self, generated_text, source_subclaim, literacy_level):
+ base_instructions = self.get_audit_prompt(literacy_level)
+ level_desc = literacy_level.replace("_", " ")
+ return f"""{base_instructions}
+ 1. Check whether the following Fact from the ORIGINAL Source Text is explicitly covered in the generated {level_desc} summary.
+ 2. Mark 'supported' ONLY IF:
+ - The summary clearly states the fact, OR
+ - The fact is conveyed through an explicit paraphrase or simplification that preserves its meaning.
+ 3. Do NOT infer support from silence or omission.
+ - Absence of mention does NOT count as support.
+ - Especially for negative or exclusionary facts (e.g., "no family history," "no extra-renal signs," "no complications"), the summary must explicitly indicate absence.
+ 4. Mark 'not_supported' if:
+ - The summary omits the fact entirely, OR
+ - The summary discusses related topics but does not clearly confirm the specific fact.
+ 5. Simplification for literacy level is allowed, but factual meaning must be preserved.
+
+ GENERATED SUMMARY: {generated_text}
+ SOURCE FACT: {source_subclaim}
+
+ output: 'supported' or 'not_supported'."""
+
+ def check_support_api(self, prompt):
+ # print(f"Prompt Sent:\n{prompt}\n")
+
+ # Real logic
+ try:
+ response = self.client.chat.completions.create(
+ model=self.model_name,
+ messages=[{"role": "user", "content": prompt}],
+ max_tokens=300, temperature=0.1,
+ )
+ res = response.choices[0].message.content.strip().lower()
+ print(f"Response Received:\n{res}\n")
+ return 1.0 if "supported" in res and "not_supported" not in res else 0.0
+        except Exception:
+ return 0.0
+
+ def evaluate_level(self, gen_text, gold_subs, full_subs, level_key):
+ if not gen_text: return 0.0, 0.0
+
+ # Using 2 workers for demo to avoid overhead
+ with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+ comp_prompts = [self.get_completeness_prompt(gen_text, s, level_key) for s in gold_subs]
+ comp_results = list(executor.map(self.check_support_api, comp_prompts))
+ comp_score = sum(comp_results) / len(comp_results) if comp_results else 0.0
+
+ cov_prompts = [self.get_source_coverage_prompt(gen_text, s, level_key) for s in full_subs]
+ cov_results = list(executor.map(self.check_support_api, cov_prompts))
+ cov_score = sum(cov_results) / len(cov_results) if cov_results else 0.0
+
+ return comp_score, cov_score
+
+ def get_reward_score(self, completion, gold_subs, full_subs):
+ data = None
+ try:
+ # completion[0]['content'] structure as expected by RL frameworks
+ text = completion[0]['content'].strip()
+
+ if "```json" in text:
+ text = text.split("```json")[-1].split("```")[0].strip()
+ elif "```" in text:
+ text = text.split("```")[-1].split("```")[0].strip()
+
+            if "<answer>" in text:
+                text = text.split("<answer>")[-1].split("</answer>")[0].strip()
+
+ data = json.loads(text)
+ except Exception as e:
+ print(f"JSON Parse Error: {e}")
+ return -5.0
+
+ levels = ["low", "intermediate", "proficient"]
+ if not all(f"{lvl}_health_literacy" in data for lvl in levels):
+ return -2.0
+
+ try:
+ total_reward = 0.0
+ print("\n--- Evaluation Breakdown ---")
+ for lvl in levels:
+ gen_text = data.get(f"{lvl}_health_literacy", "")
+ comp_score, cov_score = self.evaluate_level(gen_text, gold_subs, full_subs, f"{lvl}_health_literacy")
+
+ # Logic check
+ comp_passed = comp_score >= self.thresholds[lvl]["comp"]
+ cov_passed = cov_score >= self.thresholds[lvl]["cov"]
+
+ total_reward += 1.0 if comp_passed else -0.5
+ total_reward += 1.0 if cov_passed else -0.5
+
+ print(f"[{lvl.upper()}] Comp: {comp_score:.2f} ({comp_passed}), Cov: {cov_score:.2f} ({cov_passed})")
+
+ return total_reward
+ except Exception as e:
+ print(f"Scoring Error: {e}")
+ return -5.0
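+
+# Score range: +1.0 per passed check and -0.5 per failed check, across two
+# checks x three levels, puts any parseable completion in [-3.0, +6.0].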
+
+# --- Execution Block ---
+
+if __name__ == "__main__":
+ verifier = MedicalClaimVerifier(mock_mode=False)
+
+ # 1. Mock Input Data (what the model generated)
+ pass_completion = [{
+ "content": """
+    <answer>
+ {
+ "low_health_literacy": "This medicine makes it easier for your heart to pump and relaxes your blood tubes. You might feel dizzy if you stand up too fast.",
+ "intermediate_health_literacy": "ACE inhibitors like Lisinopril relax blood vessels to improve flow and lower heart attack risk. Side effects include low blood pressure.",
+ "proficient_health_literacy": "ACE inhibitors attenuate the effects of stress hormones on the myocardium while inducing vasodilation to reduce afterload and prevent myocardial infarction."
+ }
+    </answer>
+ """
+ }]
+
+    # 2. Completeness references (essential findings from a Gold Summary)
+ gold_subs = [
+ "ACE inhibitors help the heart pump better.",
+ "These medicines relax blood vessels.",
+ "Common side effects include dizziness and low blood pressure."
+ ]
+
+ # Source Coverage (Detailed facts from the original Full Text)
+ full_subs = [
+ "Lisinopril is an example of an ACE inhibitor.",
+ "ACE inhibitors lower the risk of a heart attack.",
+ "The medication prevents stress hormones from damaging the heart.",
+ "Patients should stand up slowly to avoid dizziness."
+ ]
+
+ # 3. Run Demo
+ print("Starting Demo Run...")
+ final_reward = verifier.get_reward_score(pass_completion, gold_subs, full_subs)
+
+ print("-" * 30)
+ print(f"FINAL REWARD SCORE: {final_reward}")
\ No newline at end of file
diff --git a/code/RL_model/unsloth_rl/testing_v2.py b/code/RL_model/unsloth_rl/testing_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c51b550b9691011ca897b0f13612b5dce34e7f0
--- /dev/null
+++ b/code/RL_model/unsloth_rl/testing_v2.py
@@ -0,0 +1,138 @@
+import json
+import concurrent.futures
+from openai import OpenAI
+
+class FactualityBenchmarker:
+ def __init__(self, api_url="http://172.16.34.29:8004/v1", model="qwen3-32b-readctrl"):
+ self.client = OpenAI(base_url=api_url, api_key="EMPTY")
+ self.model = model
+
+ def verify_claim(self, context, claim):
+ """
+ Asks the model to determine if the context supports the claim.
+ """
+ prompt = f"""
+ CONTEXT:
+ {context}
+
+ CLAIM TO VERIFY:
+ {claim}
+
+ INSTRUCTION:
+ Does the CONTEXT above provide enough evidence to support the CLAIM?
+        - Answer 'supported' if the claim is explicitly stated in the context or can be logically inferred from it.
+        - Answer 'not_supported' if the claim is missing, contradicts the text, or requires outside information.
+
+ Output only one word: 'supported' or 'not_supported'.
+ """
+
+ try:
+ response = self.client.chat.completions.create(
+ model=self.model,
+ messages=[{"role": "user", "content": prompt}],
+ temperature=0.0, # Zero temp for consistency in benchmarks
+ max_tokens=10
+ )
+ result = response.choices[0].message.content.strip().lower()
+ return "supported" if "supported" in result and "not_supported" not in result else "not_supported"
+ except Exception as e:
+ print(f"Error: {e}")
+ return "not_supported"
+
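+    # Optional parallel variant (a sketch, not used by run_evaluation below):
+    # verify many claims against one context concurrently via a thread pool.
+    def verify_claims_parallel(self, context, claims, max_workers=8):
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as ex:
+            return list(ex.map(lambda c: self.verify_claim(context, c), claims))
+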
+ def run_evaluation(self, test_cases):
+ """
+ Runs the benchmark over a list of test cases.
+        Each test case: {"context": "...", "claims": [{"text": "...", "expected": "supported" or "not_supported"}]}
+ """
+ total_claims = 0
+ correct_predictions = 0
+
+ print(f"--- Starting Evaluation on {self.model} ---")
+
+ for i, case in enumerate(test_cases):
+ context = case["context"]
+ print(f"\nTest Case {i+1}:")
+
+ for claim_data in case["claims"]:
+ claim_text = claim_data["text"]
+ expected = claim_data["expected"]
+
+ # Model Prediction
+ prediction = self.verify_claim(context, claim_text)
+
+ is_correct = (prediction == expected)
+ if is_correct:
+ correct_predictions += 1
+ total_claims += 1
+
+ status = "PASS" if is_correct else "FAIL"
+ print(f" [{status}] Claim: {claim_text[:60]}... (Expected: {expected}, Got: {prediction})")
+
+ accuracy = (correct_predictions / total_claims) * 100 if total_claims > 0 else 0
+        print("\n" + "=" * 30)
+ print(f"FINAL ACCURACY: {accuracy:.2f}% ({correct_predictions}/{total_claims})")
+ print("="*30)
+
+# --- Define your test data here ---
+test_data = [
+ {
+ "context": """CASE PRESENTATION:
+A 64-year-old male with a 15-year history of Type 2 Diabetes Mellitus and stage 3 chronic kidney disease (CKD)
+presented to the emergency department with acute shortness of breath and peripheral edema. On physical
+examination, the patient was hypertensive (175/95 mmHg) and tachycardic (110 bpm). Lung auscultation revealed
+bilateral crackles in the lower lobes, consistent with pulmonary congestion. Notable laboratory findings
+included a Serum Creatinine of 2.8 mg/dL (baseline 1.9 mg/dL) and a Brain Natriuretic Peptide (BNP) of 1,250 pg/mL.
+
+Crucially, the patient reported no history of tobacco use and denied any chest pain or radiating pain to the
+left arm. An EKG showed sinus tachycardia but no ST-segment elevation or T-wave inversion. The medical team
+initiated a regimen of intravenous furosemide (40mg bolus) and transitioned the patient from his home
+medication (Metformin) to insulin glargine to manage blood glucose during the acute episode, citing concerns
+over lactic acidosis risk given the acute kidney injury. After 48 hours, the patient's oxygen saturation
+improved from 89% on room air to 95%, and his weight decreased by 3.2 kg due to successful diuresis.
+The discharge summary noted that despite the respiratory distress, there were no signs of systemic infection
+or fever during the entire 4-day hospital stay.""",
+ "claims":[
+ # 1. Literal Extraction
+ {"text": "The patient has had Type 2 Diabetes for 15 years.", "expected": "supported"},
+
+ # 2. Medical Paraphrasing (Reading Control)
+ {"text": "The patient showed signs of fluid buildup in the lungs.", "expected": "supported"}, # 'bilateral crackles/congestion'
+
+ # 3. Negative Constraint (Exclusionary fact)
+ {"text": "The patient has a history of smoking.", "expected": "not_supported"}, # Text says 'no history of tobacco'
+
+ # 4. Mathematical Inference
+ {"text": "The patient's Serum Creatinine increased by 0.9 mg/dL from his baseline.", "expected": "supported"}, # 2.8 - 1.9 = 0.9
+
+ # 5. Logic: Cause and Effect
+ {"text": "The doctors stopped Metformin because of the risk of lactic acidosis.", "expected": "supported"},
+
+ # 6. Negative Finding (Testing 'Silence')
+ {"text": "The patient complained of pain moving down his left arm.", "expected": "not_supported"}, # Specifically denied
+
+ # 7. Vital Sign Interpretation
+ {"text": "The patient was experiencing high blood pressure and a fast heart rate upon arrival.", "expected": "supported"}, # 175/95 and 110bpm
+
+ # 8. Numerical Recovery
+ {"text": "The patient lost over 3 kilograms during the first two days of treatment.", "expected": "supported"}, # 3.2 kg
+
+ # 9. Complex Inference (EKG interpretation)
+ {"text": "The EKG provided clear evidence of an active heart attack.", "expected": "not_supported"}, # Text says 'no ST-elevation'
+
+ # 10. Systemic Health Status
+ {"text": "The patient remained afebrile throughout the hospitalization.", "expected": "supported"} # 'no fever' = afebrile
+]
+ },
+ {
+ "context": "The company reported a 15% increase in revenue, reaching $2 billion this quarter. However, net profit dropped due to high R&D costs.",
+ "claims": [
+ {"text": "Revenue reached $2 billion.", "expected": "supported"},
+ {"text": "Net profit increased this quarter.", "expected": "not_supported"},
+ {"text": "Spending on Research and Development impacted profits.", "expected": "supported"}
+ ]
+ }
+]
+
+if __name__ == "__main__":
+ benchmarker = FactualityBenchmarker()
+ benchmarker.run_evaluation(test_data)
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/.gitignore b/code/RL_model/verl/Search-R1/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..be07f884731029d4ced93aa284b0d3ee06b57371
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/.gitignore
@@ -0,0 +1,122 @@
+**/*.pt
+**/checkpoints
+**/wget-log
+**/_build/
+**/*.ckpt
+**/outputs
+**/*.tar.gz
+**/playground
+**/wandb
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+dataset/*
+tensorflow/my_graph/*
+.idea/
+# C extensions
+*.so
+data
+sft/output/*
+sft/data/*
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+image_outputs
+
+checkpoints
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+
+# virtualenv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
+
+# vscode
+.vscode
+
+# Mac
+.DS_Store
+
+# output logs
+tests/e2e/toy_examples/deepspeed/synchronous/output.txt
+
+# vim
+*.swp
+
+# log*
+log/
+
+**logs
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/LICENSE b/code/RL_model/verl/Search-R1/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..d645695673349e3947e8e5ae42332d0ac3164cd7
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/LICENSE
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/Notice.txt b/code/RL_model/verl/Search-R1/Notice.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ade439da525ac3f82936e131a1ae386f43207fd8
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/Notice.txt
@@ -0,0 +1 @@
+Copyright 2023-2024 Bytedance Ltd. and/or its affiliates
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/README.md b/code/RL_model/verl/Search-R1/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..86259e3ab90c2a57b459a09584512e62f1189d1a
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/README.md
@@ -0,0 +1,275 @@
+# Search-R1: Train your LLMs to reason and call a search engine with reinforcement learning
+
+
+**Search-R1** is a reinforcement learning framework designed for training **reasoning-and-searching interleaved LLMs**—language models that learn to reason and make tool calls (e.g., to search engines) in a coordinated manner.
+
+
+Built upon [veRL](https://github.com/volcengine/verl), Search-R1 extends the ideas of **DeepSeek-R1(-Zero)** by incorporating interleaved search engine access and provides a fully open-source RL training pipeline. It serves as an alternative and open solution to **OpenAI DeepResearch**, enabling research and development in tool-augmented LLM reasoning.
+
+
+
+We support different RL methods (e.g., PPO, GRPO, REINFORCE), different LLMs (e.g., Llama3, Qwen2.5, etc.) and different search engines (e.g., local sparse/dense retrievers and online search engines).
+
+Paper: [link1](https://arxiv.org/pdf/2503.09516), [link2](https://arxiv.org/abs/2505.15117); Model and data: [link](https://huggingface.co/collections/PeterJinGo/search-r1-67d1a021202731cb065740f5); Twitter thread: [link](https://x.com/BowenJin13/status/1895544294473109889); Full experiment log: [prelim](https://wandb.ai/peterjin/Search-R1-open); [v0.1](https://wandb.ai/peterjin/Search-R1-nq_hotpotqa_train); [v0.2](https://wandb.ai/peterjin/Search-R1-v0.2); [v0.3](https://wandb.ai/peterjin/Search-R1-v0.3). Details about these logs and methods can be found [here](https://github.com/PeterGriffinJin/Search-R1/blob/main/docs/experiment_log.md).
+
+
+
+
+## News
+
+- [2025.10] Search-R1 is featured in Thinking Machines Lab's first product [Tinker](https://github.com/thinking-machines-lab/tinker-cookbook)! Details: [Document](https://github.com/thinking-machines-lab/tinker-cookbook/tree/main/tinker_cookbook/recipes/tool_use/search).
+- [2025.7] Search-R1 is supported by [SkyRL](https://github.com/NovaSky-AI/SkyRL)! Detailed instructions: [code](https://github.com/NovaSky-AI/SkyRL/tree/main/skyrl-train/examples/search), [Document](https://novasky-ai.notion.site/skyrl-searchr1).
+- [2025.6] Search-R1 is now integrated into the latest version of veRL and can take advantage of its most up-to-date features! Detailed instructions: [veRL](https://verl.readthedocs.io/en/latest/sglang_multiturn/search_tool_example.html), [English Document](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/blob/main/rlhf/verl/multi-turn/tool_examples/verl-multiturn-searchR1-like.md), [Chinese Document](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/blob/main/rlhf/verl/multi-turn/tool_examples/verl-multiturn-searchR1-like_ZH.md).
+- [2025.5] The second [paper](https://arxiv.org/abs/2505.15117) conducting detailed empirical studies is published with logs: [v0.3](https://wandb.ai/peterjin/Search-R1-v0.3).
+- [2025.4] We support [multinode](https://github.com/PeterGriffinJin/Search-R1/blob/main/docs/multinode.md) training for 30B+ LLMs!
+- [2025.4] We support [different search engines](https://github.com/PeterGriffinJin/Search-R1/blob/main/docs/retriever.md) including sparse local retriever, dense local retriever with ANN indexing and online search engines!
+- [2025.3] The first Search-R1 [paper](https://arxiv.org/pdf/2503.09516) is published with the logs: [v0.1](https://wandb.ai/peterjin/Search-R1-nq_hotpotqa_train); [v0.2](https://wandb.ai/peterjin/Search-R1-v0.2).
+- [2025.2] We open-source the Search-R1 codebase with [preliminary results](https://wandb.ai/peterjin/Search-R1-open).
+
+## Links
+
+- [Installation](#installation)
+- [Quick start](#quick-start)
+- [Preliminary results](#preliminary-results)
+- [Inference](#inference)
+- [Use your own dataset](#use-your-own-dataset)
+- [Use your own search engine](#use-your-own-search-engine)
+- [Features](#features)
+- [Acknowledge](#acknowledge)
+- [Citations](#citations)
+
+## Installation
+
+### Search-R1 environment
+```bash
+conda create -n searchr1 python=3.9
+conda activate searchr1
+# install torch [or you can skip this step and let vllm install the correct version for you]
+pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu121
+# install vllm
+pip3 install vllm==0.6.3 # or you can install 0.5.4, 0.4.2 and 0.3.1
+
+# verl
+pip install -e .
+
+# flash attention 2
+pip3 install flash-attn --no-build-isolation
+pip install wandb
+```
+
+### Retriever environment (optional)
+If you would like to call a local retriever as the search engine, you can install the environment as follows. (We recommend using a separate environment.)
+```bash
+conda create -n retriever python=3.10
+conda activate retriever
+
+# we recommend installing torch with conda for faiss-gpu
+conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 pytorch-cuda=12.1 -c pytorch -c nvidia
+pip install transformers datasets pyserini
+
+## install the GPU version of faiss to guarantee efficient RL rollout
+conda install -c pytorch -c nvidia faiss-gpu=1.8.0
+
+## API function
+pip install uvicorn fastapi
+```
+
+
+## Quick start
+
+Train a reasoning + search LLM on the NQ dataset, with e5 as the retriever and Wikipedia as the corpus.
+
+(1) Download the index and corpus.
+```bash
+save_path=/the/path/to/save
+python scripts/download.py --save_path $save_path
+cat $save_path/part_* > $save_path/e5_Flat.index
+gzip -d $save_path/wiki-18.jsonl.gz
+```
+
+(2) Process the NQ dataset.
+```bash
+python scripts/data_process/nq_search.py
+```
+
+(3) Launch a local retrieval server.
+```bash
+conda activate retriever
+bash retrieval_launch.sh
+```
+
+(4) Run RL training (PPO) with Llama-3.2-3b-base.
+```bash
+conda activate searchr1
+bash train_ppo.sh
+```
+
+## Preliminary results
+
+(1) The base model (llama3.2-3b-base) learns to call the search engine and obtains improved performance.
+
+
+
+
+(2) The base model (Qwen2.5-7b-base) can learn to conduct multi-turn search engine calling and reasoning with RL.
+
+
+
+## Inference
+#### You can play with the trained Search-R1 model using your own question.
+(1) Launch a local retrieval server.
+```bash
+conda activate retriever
+bash retrieval_launch.sh
+```
+
+(2) Run inference.
+```bash
+conda activate searchr1
+python infer.py
+```
+You can modify the ```question``` on line 7 to something you're interested in.
+
+## Use your own dataset
+
+### QA data
+Each question-answer sample should be a dictionary containing the fields below:
+
+```
+data = {
+ "data_source": data_source,
+ "prompt": [{
+ "role": "user",
+ "content": question,
+ }],
+ "ability": "fact-reasoning",
+ "reward_model": {
+ "style": "rule",
+ "ground_truth": solution
+ },
+ "extra_info": {
+ 'split': split,
+ 'index': idx,
+ }
+ }
+```
+
+You can refer to ```scripts/data_process/nq_search.py``` for a concrete data processing example.
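+
+As a minimal sketch (not the repo's actual script), the snippet below converts raw question-answer pairs into the dict layout above and saves them as the parquet files the training scripts read via ```data.train_files```. The ```raw_pairs``` list, the ```make_sample``` helper, and the output path are illustrative; follow ```scripts/data_process/nq_search.py``` for the exact ```reward_model``` payload your reward function expects.
+
+```python
+from datasets import Dataset
+
+# Illustrative raw data; replace with your own QA pairs.
+raw_pairs = [
+    ("who wrote the declaration of independence?", "Thomas Jefferson"),
+]
+
+def make_sample(question, answer, idx, split="train", data_source="my_qa"):
+    # Mirror the field layout shown above.
+    return {
+        "data_source": data_source,
+        "prompt": [{"role": "user", "content": question}],
+        "ability": "fact-reasoning",
+        "reward_model": {"style": "rule", "ground_truth": answer},
+        "extra_info": {"split": split, "index": idx},
+    }
+
+samples = [make_sample(q, a, i) for i, (q, a) in enumerate(raw_pairs)]
+Dataset.from_list(samples).to_parquet("data/my_qa/train.parquet")
+```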
+
+### Corpora
+
+It is recommended to make your corpus a jsonl file, where each line (a dictionary with an "id" key and a "contents" key) corresponds to one passage. You can refer to ```example/corpus.jsonl``` for an example.
+
+The "id" key corresponds to the passage id, while the "contents" key corresponds to the passage content ('"' + title + '"\n' + text).
+For example:
+```
+{"id": "0", "contents": "Evan Morris Evan L. Morris (January 26, 1977 \u2013 July 9, 2015) was a lobbyist for Genentech and its parent corporation Roche in Washington."}
+...
+{"id": "100", "contents": "Three years later, when the United States Exploring Expedition to little-known portions of the globe was organised under Charles Wilkes, Hale was recommended, while yet an undergraduate."}
+...
+```
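+
+A minimal sketch of producing this layout from your own passages; the ```passages``` list and the output filename are illustrative. Note that "contents" packs the quoted title on the first line, followed by the passage text:
+
+```python
+import json
+
+# Illustrative passages; replace with your own corpus.
+passages = [
+    {"title": "Evan Morris", "text": "Evan L. Morris was a lobbyist for Genentech and Roche."},
+]
+
+with open("my_corpus.jsonl", "w", encoding="utf-8") as f:
+    for pid, p in enumerate(passages):
+        # '"' + title + '"\n' + text, as described above.
+        doc = {"id": str(pid), "contents": f'"{p["title"]}"\n{p["text"]}'}
+        f.write(json.dumps(doc, ensure_ascii=False) + "\n")
+```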
+
+**Index your corpora (optional).**
+If you would like to use a local retriever as the search engine, you can index your own corpus by:
+```
+bash search_r1/search/build_index.sh
+```
+You can change ```retriever_name``` and ```retriever_model``` to the off-the-shelf retriever you are interested in.
+
+## Use your own search engine
+
+Our codebase supports local sparse retrievers (e.g., BM25), local dense retrievers (both flat indexing with GPUs and ANN indexing with CPUs) and online search engines (e.g., Google, Bing, etc.). More details can be found [here](https://github.com/PeterGriffinJin/Search-R1/tree/main/docs/retriever.md).
+
+The main philosophy is to launch a local or remote search engine server separately from the main RL training pipeline.
+
+The LLM calls the search engine through the search API (e.g., "http://127.0.0.1:8000/retrieve").
+
+You can refer to ```search_r1/search/retriever_server.py``` for an example of launching a local retriever server.
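+
+As a rough sketch of what such a call looks like (the payload shape mirrors the one in ```infer.py```; it assumes a server launched via ```retrieval_launch.sh``` is listening at the address above):
+
+```python
+import requests
+
+payload = {"queries": ["natural pacemaker of the heart"], "topk": 3, "return_scores": True}
+resp = requests.post("http://127.0.0.1:8000/retrieve", json=payload).json()
+
+# Each query gets a ranked list of documents; the first line of
+# "contents" is the passage title, the rest is the passage text.
+for doc_item in resp["result"][0]:
+    print(doc_item["document"]["contents"].split("\n")[0])
+```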
+
+## Features
+- Support local sparse retrievers (e.g., BM25). ✔️
+- Support local dense retrievers (both flat indexing and ANN indexing). ✔️
+- Support Google / Bing / Brave search APIs and others. ✔️
+- Support off-the-shelf neural rerankers. ✔️
+- Support different RL methods (e.g., PPO, GRPO, REINFORCE). ✔️
+- Support different LLMs (e.g., Llama3, Qwen2.5, etc.). ✔️
+
+## Acknowledge
+
+The concept of Search-R1 is inspired by [Deepseek-R1](https://github.com/deepseek-ai/DeepSeek-R1) and [TinyZero](https://github.com/Jiayi-Pan/TinyZero/tree/main).
+Its implementation is built upon [veRL](https://github.com/volcengine/verl) and [RAGEN](https://github.com/ZihanWang314/RAGEN/tree/main).
+We sincerely appreciate these teams' contributions to open-source research and development.
+
+## Awesome work powered or inspired by Search-R1
+
+- [DeepResearcher](https://github.com/GAIR-NLP/DeepResearcher): Scaling Deep Research via Reinforcement Learning in Real-world Environments. [![[code]](https://img.shields.io/github/stars/GAIR-NLP/DeepResearcher)](https://github.com/GAIR-NLP/DeepResearcher)
+- [Multimodal-Search-R1](https://github.com/EvolvingLMMs-Lab/multimodal-search-r1): Incentivizing LMMs to Search. [![[code]](https://img.shields.io/github/stars/EvolvingLMMs-Lab/multimodal-search-r1)](https://github.com/EvolvingLMMs-Lab/multimodal-search-r1)
+- [OTC](https://arxiv.org/pdf/2504.14870): Optimal Tool Calls via Reinforcement Learning.
+- [ZeroSearch](https://github.com/Alibaba-NLP/ZeroSearch): Incentivize the Search Capability of LLMs without Searching. [![[code]](https://img.shields.io/github/stars/Alibaba-NLP/ZeroSearch)](https://github.com/Alibaba-NLP/ZeroSearch)
+- [IKEA](https://github.com/hzy312/knowledge-r1): Reinforced Internal-External Knowledge Synergistic Reasoning for Efficient Adaptive Search Agent. [![[code]](https://img.shields.io/github/stars/hzy312/knowledge-r1)](https://github.com/hzy312/knowledge-r1)
+- [Scent of Knowledge](https://arxiv.org/abs/2505.09316): Optimizing Search-Enhanced Reasoning with Information Foraging.
+- [AutoRefine](https://www.arxiv.org/pdf/2505.11277): Search and Refine During Think. [![[code]](https://img.shields.io/github/stars/syr-cn/AutoRefine)](https://github.com/syr-cn/AutoRefine)
+- [O^2-Searcher](https://arxiv.org/pdf/2505.16582): A Searching-based Agent Model for Open-Domain Open-Ended Question Answering. [![[code]](https://img.shields.io/github/stars/Acade-Mate/O2-Searcher)](https://github.com/Acade-Mate/O2-Searcher)
+- [MaskSearch](https://arxiv.org/pdf/2505.20285): A Universal Pre-Training Framework to Enhance Agentic Search Capability. [![[code]](https://img.shields.io/github/stars/Alibaba-NLP/MaskSearch)](https://github.com/Alibaba-NLP/MaskSearch)
+- [VRAG-RL](https://arxiv.org/abs/2505.22019): Vision-Perception-Based RAG for Visually Rich Information Understanding. [![[code]](https://img.shields.io/github/stars/Alibaba-NLP/VRAG)](https://github.com/Alibaba-NLP/VRAG)
+- [R1-Code-Interpreter](https://arxiv.org/abs/2505.21668): Training LLMs to Reason with Code via SFT and RL. [![[code]](https://img.shields.io/github/stars/yongchao98/R1-Code-Interpreter)](https://github.com/yongchao98/R1-Code-Interpreter)
+- [R-Search](https://arxiv.org/abs/2506.04185): Empowering LLM Reasoning with Search via Multi-Reward Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/QingFei1/R-Search)](https://github.com/QingFei1/R-Search)
+- [StepSearch](https://arxiv.org/pdf/2505.15107): Igniting LLMs Search Ability via Step-Wise Proximal Policy Optimization. [![[code]](https://img.shields.io/github/stars/Zillwang/StepSearch)](https://github.com/Zillwang/StepSearch)
+- [SimpleTIR](https://simpletir.notion.site/report): Stable End-to-End Reinforcement Learning for Multi-Turn Tool-Integrated Reasoning. [![[code]](https://img.shields.io/github/stars/ltzheng/SimpleTIR)](https://github.com/ltzheng/SimpleTIR)
+- [Router-R1](https://arxiv.org/pdf/2506.09033): Teaching LLMs Multi-Round Routing and Aggregation via Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/ulab-uiuc/Router-R1)](https://github.com/ulab-uiuc/Router-R1)
+- [SkyRL](https://skyrl.readthedocs.io/en/latest/): A Modular Full-stack RL Library for LLMs. [![[code]](https://img.shields.io/github/stars/NovaSky-AI/SkyRL)](https://github.com/NovaSky-AI/SkyRL)
+- [ASearcher](https://arxiv.org/abs/2508.07976): Large-Scale RL for Search Agents. [![[code]](https://img.shields.io/github/stars/inclusionAI/ASearcher)](https://github.com/inclusionAI/ASearcher)
+- [ParallelSearch](https://www.arxiv.org/abs/2508.09303): Decompose Query and Search Sub-queries in Parallel with RL. [![[code]](https://img.shields.io/github/stars/Tree-Shu-Zhao/ParallelSearch)](https://github.com/Tree-Shu-Zhao/ParallelSearch)
+- [AutoTIR](https://arxiv.org/pdf/2507.21836): Autonomous Tools Integrated Reasoning via Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/weiyifan1023/AutoTIR)](https://github.com/weiyifan1023/AutoTIR)
+- [verl-tool](https://arxiv.org/pdf/2509.01055): A version of verl to support diverse tool use. [![[code]](https://img.shields.io/github/stars/TIGER-AI-Lab/verl-tool)](https://github.com/TIGER-AI-Lab/verl-tool)
+- [Tree-GRPO](https://arxiv.org/abs/2509.21240): Tree Search for LLM Agent Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/AMAP-ML/Tree-GRPO)](https://github.com/AMAP-ML/Tree-GRPO)
+- [EviNote-RAG](https://arxiv.org/abs/2509.00877): Enhancing RAG Models via Answer-Supportive Evidence Notes. [![[code]](https://img.shields.io/github/stars/Da1yuqin/EviNoteRAG)](https://github.com/Da1yuqin/EviNoteRAG)
+- [GlobalRAG](https://arxiv.org/pdf/2510.20548v1): Enhancing Global Reasoning in Multi-hop Question Answering via Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/CarnegieBin/GlobalRAG)](https://github.com/CarnegieBin/GlobalRAG)
+
+
+
+
+
+## Citations
+
+```bibtex
+@article{jin2025search,
+ title={Search-r1: Training llms to reason and leverage search engines with reinforcement learning},
+ author={Jin, Bowen and Zeng, Hansi and Yue, Zhenrui and Yoon, Jinsung and Arik, Sercan and Wang, Dong and Zamani, Hamed and Han, Jiawei},
+ journal={arXiv preprint arXiv:2503.09516},
+ year={2025}
+}
+```
+
+```bibtex
+@article{jin2025empirical,
+ title={An Empirical Study on Reinforcement Learning for Reasoning-Search Interleaved LLM Agents},
+ author={Jin, Bowen and Yoon, Jinsung and Kargupta, Priyanka and Arik, Sercan O and Han, Jiawei},
+ journal={arXiv preprint arXiv:2505.15117},
+ year={2025}
+}
+```
diff --git a/code/RL_model/verl/Search-R1/VERL_README.md b/code/RL_model/verl/Search-R1/VERL_README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b6bc92a6fd3329a1ccdca91c06e2f950b5cd282a
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/VERL_README.md
@@ -0,0 +1,103 @@
+# veRL: Volcano Engine Reinforcement Learning for LLM
+
+veRL is a flexible, efficient and production-ready RL training framework designed for large language models (LLMs).
+
+veRL is the open-source version of **[HybridFlow: A Flexible and Efficient RLHF Framework](https://arxiv.org/abs/2409.19256v2)** paper.
+
+veRL is flexible and easy to use with:
+
+- **Easy extension of diverse RL algorithms**: The Hybrid programming model combines the strengths of single-controller and multi-controller paradigms to enable flexible representation and efficient execution of complex Post-Training dataflows, allowing users to build RL dataflows in a few lines of code.
+
+- **Seamless integration of existing LLM infra with modular APIs**: Decouples computation and data dependencies, enabling seamless integration with existing LLM frameworks, such as PyTorch FSDP, Megatron-LM and vLLM. Moreover, users can easily extend to other LLM training and inference frameworks.
+
+- **Flexible device mapping**: Supports various placement of models onto different sets of GPUs for efficient resource utilization and scalability across different cluster sizes.
+
+- Ready integration with popular HuggingFace models
+
+
+veRL is fast with:
+
+- **State-of-the-art throughput**: By seamlessly integrating existing SOTA LLM training and inference frameworks, veRL achieves high generation and training throughput.
+
+- **Efficient actor model resharding with 3D-HybridEngine**: Eliminates memory redundancy and significantly reduces communication overhead during transitions between training and generation phases.
+
+
+
+
+
+
+## News
+
+- [2024/12] The team presented Post-training LLMs: From Algorithms to Infrastructure at NeurIPS 2024. [Slides](https://github.com/eric-haibin-lin/verl-data/tree/neurips) and [video](https://neurips.cc/Expo/Conferences/2024/workshop/100677) available.
+- [2024/10] veRL is presented at Ray Summit. [Youtube video](https://www.youtube.com/watch?v=MrhMcXkXvJU&list=PLzTswPQNepXntmT8jr9WaNfqQ60QwW7-U&index=37) available.
+- [2024/08] HybridFlow (verl) is accepted to EuroSys 2025.
+
+## Key Features
+
+- **FSDP** and **Megatron-LM** for training.
+- **vLLM** and **TGI** for rollout generation, **SGLang** support coming soon.
+- HuggingFace models support
+- Supervised fine-tuning
+- Reward model training
+- Reinforcement learning from human feedback with PPO
+- Flash-attention integration, sequence packing
+- Scales up to 70B models and hundreds of GPUs
+- Experiment tracking with wandb and mlflow
+
+
+## Getting Started
+
+Check out this [Jupyter Notebook](https://github.com/volcengine/verl/tree/main/examples/ppo_trainer/verl_getting_started.ipynb) to get started with PPO training with a single 24GB L4 GPU (**FREE** GPU quota provided by [Lightning Studio](https://lightning.ai/hlin-verl/studios/verl-getting-started))!
+
+**Quickstart:**
+- [Installation](https://verl.readthedocs.io/en/latest/start/install.html)
+- [Quickstart](https://verl.readthedocs.io/en/latest/start/quickstart.html)
+
+**Running a PPO example step-by-step:**
+- Data and Reward Preparation
+ - [Prepare Data (Parquet) for Post-Training](https://verl.readthedocs.io/en/latest/preparation/prepare_data.html)
+ - [Implement Reward Function for Dataset](https://verl.readthedocs.io/en/latest/preparation/reward_function.html)
+- Understanding the PPO Example
+ - [PPO Example Architecture](https://verl.readthedocs.io/en/latest/examples/ppo_code_architecture.html)
+ - [Config Explanation](https://verl.readthedocs.io/en/latest/examples/config.html)
+ - [Run GSM8K Example](https://verl.readthedocs.io/en/latest/examples/gsm8k_example.html)
+
+**Reproducible algorithm baselines:**
+- [PPO](https://verl.readthedocs.io/en/latest/experiment/ppo.html)
+
+**For code explanation and advanced usage (extension):**
+- PPO Trainer and Workers
+ - [PPO Ray Trainer](https://verl.readthedocs.io/en/latest/workers/ray_trainer.html)
+ - [PyTorch FSDP Backend](https://verl.readthedocs.io/en/latest/workers/fsdp_workers.html)
+ - [Megatron-LM Backend](https://verl.readthedocs.io/en/latest/index.html)
+- Advanced Usage and Extension
+ - [Ray API Design Tutorial](https://verl.readthedocs.io/en/latest/advance/placement.html)
+ - [Extend to other RL(HF) algorithms](https://verl.readthedocs.io/en/latest/advance/dpo_extension.html)
+ - [Add models with the FSDP backend](https://verl.readthedocs.io/en/latest/advance/fsdp_extension.html)
+ - [Add models with the Megatron-LM backend](https://verl.readthedocs.io/en/latest/advance/megatron_extension.html)
+
+
+## Citation and acknowledgement
+
+If you find the project helpful, please cite:
+- [HybridFlow: A Flexible and Efficient RLHF Framework](https://arxiv.org/abs/2409.19256v2)
+- [A Framework for Training Large Language Models for Code Generation via Proximal Policy Optimization](https://i.cs.hku.hk/~cwu/papers/gmsheng-NL2Code24.pdf)
+
+```tex
+@article{sheng2024hybridflow,
+ title = {HybridFlow: A Flexible and Efficient RLHF Framework},
+ author = {Guangming Sheng and Chi Zhang and Zilingfeng Ye and Xibin Wu and Wang Zhang and Ru Zhang and Yanghua Peng and Haibin Lin and Chuan Wu},
+ year = {2024},
+ journal = {arXiv preprint arXiv: 2409.19256}
+}
+```
+
+verl is inspired by the design of Nemo-Aligner, Deepspeed-chat and OpenRLHF. The project is adopted and supported by Anyscale, Bytedance, LMSys.org, Shanghai AI Lab, Tsinghua University, UC Berkeley, UCLA, UIUC, and University of Hong Kong.
+
+## Publications Using veRL
+- [Enhancing Multi-Step Reasoning Abilities of Language Models through Direct Q-Function Optimization](https://arxiv.org/abs/2410.09302)
+- [Flaming-hot Initiation with Regular Execution Sampling for Large Language Models](https://arxiv.org/abs/2410.21236)
+- [Process Reinforcement Through Implicit Rewards](https://github.com/PRIME-RL/PRIME/)
+
+We are HIRING! Send us an [email](mailto:haibin.lin@bytedance.com) if you are interested in internship/FTE opportunities in MLSys/LLM reasoning/multimodal alignment.
diff --git a/code/RL_model/verl/Search-R1/infer.py b/code/RL_model/verl/Search-R1/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b93fa84f09b8fc9e6301f41e291c6cec2fb756b
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/infer.py
@@ -0,0 +1,128 @@
+import re
+
+import requests
+import torch
+import transformers
+
+question = "Mike Barnett negotiated many contracts including which player that went on to become general manager of CSKA Moscow of the Kontinental Hockey League?"
+
+# Model ID and device setup
+model_id = "PeterJinGo/SearchR1-nq_hotpotqa_train-qwen2.5-7b-em-ppo"
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+question = question.strip()
+if question[-1] != '?':
+ question += '?'
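+# Token IDs below are <|im_end|> (151645) and <|endoftext|> (151643) in the Qwen2.5 tokenizer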
+curr_eos = [151645, 151643] # for Qwen2.5 series models
+curr_search_template = '\n\n{output_text}<information>{search_results}</information>\n\n'
+
+# Prepare the message
+prompt = f"""Answer the given question. \
+You must conduct reasoning inside <think> and </think> first every time you get new information. \
+After reasoning, if you find you lack some knowledge, you can call a search engine by <search> query </search> and it will return the top searched results between <information> and </information>. \
+You can search as many times as you want. \
+If you find no further external knowledge needed, you can directly provide the answer inside <answer> and </answer>, without detailed illustrations. For example, <answer> Beijing </answer>. Question: {question}\n"""
+
+# Initialize the tokenizer and model
+tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
+model = transformers.AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
+
+# Define the custom stopping criterion
+class StopOnSequence(transformers.StoppingCriteria):
+ def __init__(self, target_sequences, tokenizer):
+ # Encode the string so we have the exact token-IDs pattern
+ self.target_ids = [tokenizer.encode(target_sequence, add_special_tokens=False) for target_sequence in target_sequences]
+ self.target_lengths = [len(target_id) for target_id in self.target_ids]
+ self._tokenizer = tokenizer
+
+ def __call__(self, input_ids, scores, **kwargs):
+ # Make sure the target IDs are on the same device
+ targets = [torch.as_tensor(target_id, device=input_ids.device) for target_id in self.target_ids]
+
+ if input_ids.shape[1] < min(self.target_lengths):
+ return False
+
+ # Compare the tail of input_ids with our target_ids
+ for i, target in enumerate(targets):
+ if torch.equal(input_ids[0, -self.target_lengths[i]:], target):
+ return True
+
+ return False
+
+def get_query(text):
+    pattern = re.compile(r"<search>(.*?)</search>", re.DOTALL)
+ matches = pattern.findall(text)
+ if matches:
+ return matches[-1]
+ else:
+ return None
+
+def search(query: str):
+ payload = {
+ "queries": [query],
+ "topk": 3,
+ "return_scores": True
+ }
+ results = requests.post("http://127.0.0.1:8000/retrieve", json=payload).json()['result']
+
+ def _passages2string(retrieval_result):
+ format_reference = ''
+ for idx, doc_item in enumerate(retrieval_result):
+
+ content = doc_item['document']['contents']
+ title = content.split("\n")[0]
+ text = "\n".join(content.split("\n")[1:])
+ format_reference += f"Doc {idx+1}(Title: {title}) {text}\n"
+ return format_reference
+
+ return _passages2string(results[0])
+
+
+# Initialize the stopping criteria
+target_sequences = ["</search>", " </search>", "</search>\n", " </search>\n", "</search>\n\n", " </search>\n\n"]
+stopping_criteria = transformers.StoppingCriteriaList([StopOnSequence(target_sequences, tokenizer)])
+
+cnt = 0
+
+if tokenizer.chat_template:
+ prompt = tokenizer.apply_chat_template([{"role": "user", "content": prompt}], add_generation_prompt=True, tokenize=False)
+
+print('\n\n################# [Start Reasoning + Searching] ##################\n\n')
+print(prompt)
+# Multi-turn loop: generate until the model emits </search> (or EOS); if it searched,
+# retrieve passages, append them to the prompt, and generate again.
+while True:
+ input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
+ attention_mask = torch.ones_like(input_ids)
+
+ # Generate text with the stopping criteria
+ outputs = model.generate(
+ input_ids,
+ attention_mask=attention_mask,
+ max_new_tokens=1024,
+ stopping_criteria=stopping_criteria,
+ pad_token_id=tokenizer.eos_token_id,
+ do_sample=True,
+ temperature=0.7
+ )
+
+ if outputs[0][-1].item() in curr_eos:
+ generated_tokens = outputs[0][input_ids.shape[1]:]
+ output_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+ print(output_text)
+ break
+
+ generated_tokens = outputs[0][input_ids.shape[1]:]
+ output_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+
+ tmp_query = get_query(tokenizer.decode(outputs[0], skip_special_tokens=True))
+ if tmp_query:
+ # print(f'searching "{tmp_query}"...')
+ search_results = search(tmp_query)
+ else:
+ search_results = ''
+
+ search_text = curr_search_template.format(output_text=output_text, search_results=search_results)
+ prompt += search_text
+ cnt += 1
+ print(search_text)
diff --git a/code/RL_model/verl/Search-R1/misc/docs/experiment_log.md b/code/RL_model/verl/Search-R1/misc/docs/experiment_log.md
new file mode 100644
index 0000000000000000000000000000000000000000..f6db08ba0d99c527bd672e2b9407062aefeb2808
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/docs/experiment_log.md
@@ -0,0 +1,47 @@
+
+## Experiment log
+
+### Preliminary results
+
+Resources: [wandb](https://wandb.ai/peterjin/Search-R1-open)
+
+
+The preliminary experiment is conducted only on the Natural Questions (NQ) dataset (with PPO) and a small number of training steps.
+
+
+### v0.1
+
+Resources: [wandb](https://wandb.ai/peterjin/Search-R1-nq_hotpotqa_train), [docs](https://github.com/PeterGriffinJin/Search-R1/tree/main/scripts/nq_hotpotqa), [scripts](https://github.com/PeterGriffinJin/Search-R1/tree/main/scripts/nq_hotpotqa/v0.1)
+
+
+We extend the experiments from NQ to seven datasets, with both PPO and GRPO. The studies still use a small number of training steps and a large learning-rate warmup ratio.
+
+
+### v0.2
+
+Resources: [wandb](https://wandb.ai/peterjin/Search-R1-v0.2), [docs](https://github.com/PeterGriffinJin/Search-R1/tree/main/scripts/nq_hotpotqa), [scripts](https://github.com/PeterGriffinJin/Search-R1/tree/main/scripts/nq_hotpotqa/v0.2), [paper](https://arxiv.org/abs/2503.09516)
+
+
+We fix several bugs including [retrieved token masking](https://github.com/PeterGriffinJin/Search-R1/pull/21) and [GRPO sample indexing](https://github.com/PeterGriffinJin/Search-R1/commit/9ec2fa9892fbf0315d0c67b4dc08ae8f6cf5f378).
+The former largely improves the stability of RL training.
+We then adjust the training scripts, increasing the number of training steps and decreasing the learning-rate warmup ratio to obtain better performance, and conduct experiments on LLMs of different scales (3B, 7B, 14B).
+
+
+### v0.3
+
+Resources: [wandb](https://wandb.ai/peterjin/Search-R1-v0.3), [docs](https://github.com/PeterGriffinJin/Search-R1/tree/main/scripts/nq_hotpotqa), [scripts](https://github.com/PeterGriffinJin/Search-R1/tree/main/scripts/nq_hotpotqa/v0.3), [paper](https://arxiv.org/abs/2505.15117)
+
+We conduct studies on (1) reward design; (2) LLM backbone; and (3) search engine.
+
+- Reward design
+ - Format reward
+ - Intermediate retrieval reward
+- LLM backbone
+ - LLM type (e.g., general LLM or reasoning LLM)
+ - LLM scale (3B/7B/14B/32B)
+- Search engine
+ - RL training dynamics
+  - Generalization during inference
+- Data scaling
+
+Details can be found in the [paper](https://arxiv.org/abs/2505.15117).
diff --git a/code/RL_model/verl/Search-R1/misc/docs/multinode.md b/code/RL_model/verl/Search-R1/misc/docs/multinode.md
new file mode 100644
index 0000000000000000000000000000000000000000..14334b21bced2a10785df0337e8f1f97727f6f7c
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/docs/multinode.md
@@ -0,0 +1,134 @@
+
+## Multinode Training
+
+Our codebase supports multi-node training for large-scale language models. The implementation is mainly based on [Ray](https://github.com/ray-project/ray).
+
+There are two types of nodes in Ray multi-node training: (1) the head node and (2) worker nodes.
+There is only one head node, where you start the ray cluster and submit the job.
+The other nodes are worker nodes, which only need to start ray and register with the cluster.
+
+### Step 1: Set up multinode ray cluster (from [link](https://verl.readthedocs.io/en/latest/start/multinode.html#set-up-multinode-ray-cluster))
+
+a. Start the **head** node with ```ray start --head --dashboard-host=0.0.0.0```; there are two addresses you should care about:
+
+- GCS address: used in ```ray start --address=<GCS address>```, which the **worker** nodes should connect to.
+
+- Dashboard address: ```<head node IP>:8265```, where you should submit jobs to the cluster.
+
+
+
+b. Start each **worker** node and register it to the ray cluster with the ```ray start --address=<GCS address>``` command, using the GCS address you got above.
+
+
+
+c. Check the cluster status with ```ray status```.
+
+For example, if you have two nodes (each with 8 GPUs) in the cluster, the ```ray status``` output should list both nodes and a total of 16 GPUs.
+
+
+### Step 2: Launch the retrieval server on every node.
+
+We recommend launching the **same** retrieval server on every node (including both head and worker nodes) for stable RL training. Detailed information on how to launch different retrievers can be found in the following [doc](https://github.com/PeterGriffinJin/Search-R1/blob/main/docs/retriever.md) and [scripts](https://github.com/PeterGriffinJin/Search-R1/tree/main/example/retriever).
+
+For example, if you want to launch the local dense retriever with flat indexing, run the following command on **every** node:
+
+```
+bash retrieval_launch.sh
+```
+
+
+### Step 3: Start the job
+
+After the retrievers are launched, you can start the training job. You only need to start the job on the ***head*** node.
+
+An example script is shown below. Change ```RAY_DASHBOARD_ADDRESS``` and ```N_NODES``` to the dashboard address found in step 1 and the number of nodes, respectively.
+
+More script examples can be found [here](https://github.com/PeterGriffinJin/Search-R1/tree/main/example/multinode).
+
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export DATA_DIR='data/nq_search'
+
+WAND_PROJECT="Search-R1-release"
+RAY_DASHBOARD_ADDRESS="http://<head_node_ip>:8265" # dashboard address from step 1
+N_NODES=2
+
+export BASE_MODEL='Qwen/Qwen2.5-7B'
+export EXPERIMENT_NAME=${train_data}-${test_data}-search-r1-ppo-qwen2.5-7b-em-multinode-$N_NODES
+
+# set -x
+export VLLM_ATTENTION_BACKEND=XFORMERS
+
+ulimit -n 65535
+
+ray job submit --address=$RAY_DASHBOARD_ADDRESS \
+ --runtime-env=verl/trainer/runtime_env.yaml \
+ --no-wait \
+ -- \
+ python3 -m verl.trainer.main_ppo \
+ data.train_files=$DATA_DIR/train.parquet \
+ data.val_files=$DATA_DIR/test.parquet \
+ data.train_data_num=null \
+ data.val_data_num=null \
+ data.train_batch_size=512 \
+ data.val_batch_size=256 \
+ data.max_prompt_length=4096 \
+ data.max_response_length=500 \
+ data.max_start_length=2048 \
+ data.max_obs_length=500 \
+ data.shuffle_train_dataloader=True \
+ algorithm.adv_estimator=gae \
+ actor_rollout_ref.model.path=$BASE_MODEL \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.grad_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=False \
+ actor_rollout_ref.rollout.n_agent=1 \
+ actor_rollout_ref.rollout.temperature=1 \
+ actor_rollout_ref.rollout.top_p=1.0 \
+ actor_rollout_ref.actor.state_masking=true \
+ critic.optim.lr=1e-5 \
+ critic.model.use_remove_padding=True \
+ critic.optim.lr_warmup_steps_ratio=0.015 \
+ critic.model.path=$BASE_MODEL \
+ critic.model.enable_gradient_checkpointing=true \
+ critic.ppo_micro_batch_size=16 \
+ critic.model.fsdp_config.param_offload=False \
+ critic.model.fsdp_config.grad_offload=False \
+ critic.model.fsdp_config.optimizer_offload=False \
+ algorithm.kl_ctrl.kl_coef=0.001 \
+ algorithm.no_think_rl=false \
+ trainer.critic_warmup=0 \
+ trainer.logger=['wandb'] \
+ +trainer.val_only=false \
+ +trainer.val_before_train=false \
+ trainer.default_hdfs_dir=null \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=$N_NODES \
+ trainer.save_freq=100 \
+ trainer.test_freq=100 \
+ trainer.project_name=$WAND_PROJECT \
+ trainer.experiment_name=$EXPERIMENT_NAME \
+ trainer.total_epochs=15 \
+ trainer.total_training_steps=1005 \
+ trainer.default_local_dir=verl_checkpoints/$EXPERIMENT_NAME \
+ max_turns=4 \
+ retriever.url="http://127.0.0.1:8000/retrieve" \
+ retriever.topk=3 \
+ 2>&1 | tee $EXPERIMENT_NAME.log
+```
diff --git a/code/RL_model/verl/Search-R1/misc/docs/retriever.md b/code/RL_model/verl/Search-R1/misc/docs/retriever.md
new file mode 100644
index 0000000000000000000000000000000000000000..5a475edf77df2f5b1ffca332f0f4be0479f70ec5
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/docs/retriever.md
@@ -0,0 +1,128 @@
+
+## Search Engine
+
+In this document, we provide examples of how to launch different retrievers, including local sparse retrievers (e.g., BM25), local dense retrievers (e.g., e5) and online search engines.
+For local retrievers, we use the [wiki-18](https://huggingface.co/datasets/PeterJinGo/wiki-18-corpus) corpus as an example; the prebuilt indexes can be found at [bm25](https://huggingface.co/datasets/PeterJinGo/wiki-18-bm25-index), [e5-flat](https://huggingface.co/datasets/PeterJinGo/wiki-18-e5-index), [e5-HNSW64](https://huggingface.co/datasets/PeterJinGo/wiki-18-e5-index-HNSW64).
+
+### How to choose the retriever?
+
+- If you have a private or domain-specific corpus, choose a **local retriever**.
+
+  - If there are no high-quality embedding-based retrievers (dense retrievers) in your domain, choose a **sparse local retriever** (e.g., BM25).
+
+  - Otherwise, choose a **dense local retriever**.
+
+    - If you do not have sufficient GPUs to conduct exact dense embedding matching, choose **ANN indexing** on CPUs.
+
+    - If you have sufficient GPUs, choose **flat indexing** on GPUs.
+
+
+- If you want to train a general LLM search agent and have enough funding, choose an **online search engine** (e.g., [SerpAPI](https://serpapi.com/)).
+
+
+- If you have a domain-specific online search engine (e.g., PubMed search), you can refer to [link](https://github.com/PeterGriffinJin/Search-R1/blob/main/search_r1/search/serp_search_server.py) to integrate it into Search-R1 yourself.
+
+Search engine launching scripts can be found at [link](https://github.com/PeterGriffinJin/Search-R1/tree/main/example/retriever).
+
+### Local Sparse Retriever
+
+Sparse retrievers (e.g., BM25) are a traditional approach. The retrieval process is very efficient and no GPUs are needed. However, they may not be as accurate as dense retrievers in some specific domains.
+
+(1) Download the index.
+```bash
+save_path=/your/path/to/save
+huggingface-cli download PeterJinGo/wiki-18-bm25-index --repo-type dataset --local-dir $save_path
+```
+
+(2) Launch a local BM25 retriever server.
+```bash
+conda activate retriever
+
+index_file=$save_path/bm25
+corpus_file=$save_path/wiki-18.jsonl
+retriever_name=bm25
+
+python search_r1/search/retrieval_server.py --index_path $index_file --corpus_path $corpus_file --topk 3 --retriever_name $retriever_name
+```
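+
+To sanity-check the launched server (a rough sketch: it assumes the default ```http://127.0.0.1:8000/retrieve``` endpoint used elsewhere in this repo, and the payload shape from ```infer.py```):
+
+```python
+import requests
+
+# Query the retrieval server with a single test query.
+payload = {"queries": ["natural pacemaker of the heart"], "topk": 3, "return_scores": True}
+print(requests.post("http://127.0.0.1:8000/retrieve", json=payload).json())
+```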
+
+
+### Local Dense Retriever
+
+You can also adopt off-the-shelf dense retrievers, e.g., e5. These models are much stronger than sparse retrievers in some specific domains.
+If you have sufficient GPUs, we recommend the flat indexing variant below; otherwise you can adopt the ANN variant.
+
+#### Flat indexing
+
+Flat indexing conducts exact embedding matching, which is slow but very accurate. To make it efficient enough to support online RL, we recommend enabling **GPU** usage via ```--faiss_gpu```.
+
+(1) Download the index and corpus.
+```bash
+save_path=/the/path/to/save
+python scripts/download.py --save_path $save_path
+cat $save_path/part_* > $save_path/e5_Flat.index
+gzip -d $save_path/wiki-18.jsonl.gz
+```
+
+(2) Launch a local flat e5 retriever server.
+
+```bash
+conda activate retriever
+
+index_file=$save_path/e5_Flat.index
+corpus_file=$save_path/wiki-18.jsonl
+retriever_name=e5
+retriever_path=intfloat/e5-base-v2
+
+python search_r1/search/retrieval_server.py --index_path $index_file --corpus_path $corpus_file --topk 3 --retriever_name $retriever_name --retriever_model $retriever_path --faiss_gpu
+
+```
+
+
+#### ANN indexing (HNSW64)
+
+To improve search efficiency with only **CPUs**, you can adopt approximate nearest neighbor (ANN) indexing, e.g., with HNSW64.
+It is very efficient, but may not be as accurate as flat indexing, especially when the number of retrieved passages is small.
+
+(1) Download the index.
+```bash
+save_path=/the/path/to/save
+huggingface-cli download PeterJinGo/wiki-18-e5-index-HNSW64 --repo-type dataset --local-dir $save_path
+cat $save_path/part_* > $save_path/e5_HNSW64.index
+```
+
+
+(2) Launch a local ANN dense retriever server.
+```bash
+conda activate retriever
+
+index_file=$save_path/e5_HNSW64.index
+corpus_file=$save_path/wiki-18.jsonl
+retriever_name=e5
+retriever_path=intfloat/e5-base-v2
+
+python search_r1/search/retrieval_server.py --index_path $index_file --corpus_path $corpus_file --topk 3 --retriever_name $retriever_name --retriever_model $retriever_path
+```
+
+
+### Online Search Engine
+
+We support both the [Google Search API](https://developers.google.com/custom-search/v1/overview) and [SerpAPI](https://serpapi.com/). We would recommend [SerpAPI](https://serpapi.com/) since it integrates multiple online search engine APIs (including Google, Bing, Baidu, etc.) and does not have a monthly quota limitation (the [Google Search API](https://developers.google.com/custom-search/v1/overview) has a hard 10k monthly quota, which is not sufficient for online LLM RL training).
+
+#### SerpAPI online search server
+
+```bash
+search_url=https://serpapi.com/search
+serp_api_key="" # put your serp api key here (https://serpapi.com/)
+
+python search_r1/search/serp_search_server.py --search_url $search_url --topk 3 --serp_api_key $serp_api_key
+```
+
+#### Google online search server
+
+```bash
+api_key="" # put your google custom API key here (https://developers.google.com/custom-search/v1/overview)
+cse_id="" # put your google cse API key here (https://developers.google.com/custom-search/v1/overview)
+
+python search_r1/search/google_search_server.py --api_key $api_key --topk 5 --cse_id $cse_id --snippet_only
+```
+
diff --git a/code/RL_model/verl/Search-R1/misc/example/case.txt b/code/RL_model/verl/Search-R1/misc/example/case.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a1193b5d58c2981b285922b0dca9a7fc2446fe71
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/example/case.txt
@@ -0,0 +1,43 @@
+<|im_start|>system
+You are a helpful assistant.<|im_end|>
+<|im_start|>user
+Answer the given question. You must conduct reasoning inside <think> and </think> first every time you get new information. After reasoning, if you find you lack some knowledge, you can call a search engine by <search> query </search> and it will return the top searched results between <information> and </information>. You can search as many times as you want. If you find no further external knowledge needed, you can directly provide the answer inside <answer> and </answer>, without detailed illustrations. For example, <answer> Beijing </answer>. Question: paul walkers cars in fast and furious list?
+<|im_end|>
+<|im_start|>assistant
+<think>I need to search for Paul Walker's cars in Fast and Furious movies.</think>
+<search> Paul Walker's cars in Fast and Furious </search>
+
+<information>Doc 1(Title: "Paul Walker") also had a large car collection of about 30 cars, a portion of which he co-owned along with Roger Rodas. The collection included (but is not limited to): Roger Rodas became Walker's financial adviser in 2007 and helped to establish Reach Out Worldwide. Rodas, a pro-am racer was the CEO of Always Evolving, a Valencia high-end vehicle performance shop owned by Walker. Walker was also a close friend of his ""2 Fast 2 Furious"" co-star Tyrese Gibson. Vin Diesel considered Walker to be like a brother, both on and off screen, and affectionately called him ""Pablo"". Walker's mother referred to
+Doc 2(Title: "Paul Walker") Paul Walker Paul William Walker IV (September 12, 1973 – November 30, 2013) was an American actor best known for his role as Brian O'Conner in ""The Fast and the Furious"" franchise. Walker first gained prominence in 1999 with roles in the teen films ""She's All That"" and ""Varsity Blues"". In 2001, he gained international fame for his role in the street racing action film ""The Fast and the Furious"" (2001), a role he reprised in five of the next six installments, but died in 2013 in the middle of filming ""Furious 7"" (2015). Walker began his career guest-starring on
+Doc 3(Title: "Paul Walker") of Porsche in a separate lawsuit filed by Roger Rodas' widow, Kristine. The ruling had no bearing on two other cases against Porsche which have been filed by Walker's father, who is also the executor of the actor's estate, and his daughter. Walker's father and daughter both reached an agreement with Porsche. Paul Walker Paul William Walker IV (September 12, 1973 – November 30, 2013) was an American actor best known for his role as Brian O'Conner in ""The Fast and the Furious"" franchise. Walker first gained prominence in 1999 with roles in the teen films ""She's All That"" and
+</information>
+
+<think>From the information provided, it's clear that Paul Walker was a part of the "Fast and Furious" series, but the specific list of cars is not mentioned. Since I lack this particular detail, I will call a search engine to get the specific list of cars Paul Walker drove in the "Fast and Furious" movies.</think>
+
+<search> list of cars paul walker drove in Fast and Furious </search>
+
+<information>Doc 1(Title: "Paul Walker") also had a large car collection of about 30 cars, a portion of which he co-owned along with Roger Rodas. The collection included (but is not limited to): Roger Rodas became Walker's financial adviser in 2007 and helped to establish Reach Out Worldwide. Rodas, a pro-am racer was the CEO of Always Evolving, a Valencia high-end vehicle performance shop owned by Walker. Walker was also a close friend of his ""2 Fast 2 Furious"" co-star Tyrese Gibson. Vin Diesel considered Walker to be like a brother, both on and off screen, and affectionately called him ""Pablo"". Walker's mother referred to
+Doc 2(Title: "Paul Walker") Paul Walker Paul William Walker IV (September 12, 1973 – November 30, 2013) was an American actor best known for his role as Brian O'Conner in ""The Fast and the Furious"" franchise. Walker first gained prominence in 1999 with roles in the teen films ""She's All That"" and ""Varsity Blues"". In 2001, he gained international fame for his role in the street racing action film ""The Fast and the Furious"" (2001), a role he reprised in five of the next six installments, but died in 2013 in the middle of filming ""Furious 7"" (2015). Walker began his career guest-starring on
+Doc 3(Title: "Paul Walker") of Porsche in a separate lawsuit filed by Roger Rodas' widow, Kristine. The ruling had no bearing on two other cases against Porsche which have been filed by Walker's father, who is also the executor of the actor's estate, and his daughter. Walker's father and daughter both reached an agreement with Porsche. Paul Walker Paul William Walker IV (September 12, 1973 – November 30, 2013) was an American actor best known for his role as Brian O'Conner in ""The Fast and the Furious"" franchise. Walker first gained prominence in 1999 with roles in the teen films ""She's All That"" and
+</information>
+
+<think> Based on the updated information, it seems the focus was on his career, financials, and family. However, I am still missing the specific list of cars he drove in the "Fast and Furious" movies. Since it appears that the information might not be contained within the accessed documents, and I have no further search queries to make, I will provide an answer based on the details I have. </think>
+
+<answer> Charger </answer>
+
+
+
+<|im_start|>system
+You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
+<|im_start|>user
+Answer the given question. You must conduct reasoning inside <think> and </think> first every time you get new information. After reasoning, if you find you lack some knowledge, you can call a search engine by <search> query </search> and it will return the top searched results between <information> and </information>. You can search as many times as you want. If you find no further external knowledge needed, you can directly provide the answer inside <answer> and </answer>, without detailed illustrations. For example, <answer> Beijing </answer>. Question: the heart muscle is stimulated to contract by electrical impulses which are generated where?
+<|im_end|>
+<|im_start|>assistant
+<think> I remember that the heart's electrical impulses originate from a specific part of the heart, but I need to confirm which part. </think>
+<search> origin of heart's electrical impulses </search>
+
+<information>Doc 1(Title: "Cardiac pacemaker") Cardiac pacemaker The contraction of cardiac muscle (heart muscle) in all animals is initiated by electrical impulses known as action potentials. The rate at which these impulses fire controls the rate of cardiac contraction, that is, the heart rate. The cells that create these rhythmic impulses, setting the pace for blood pumping, are called pacemaker cells, and they directly control the heart rate. They make up the cardiac pacemaker, that is, the natural pacemaker of the heart. In most humans, the concentration of pacemaker cells in the sinoatrial (SA) node is the natural pacemaker, and the resultant rhythm is a
+Doc 2(Title: "Electrical conduction system of the heart") Electrical conduction system of the heart The electrical conduction system of the heart transmits signals generated usually by the sinoatrial node to cause contraction of the heart muscle. The pacemaking signal generated in the sinoatrial node travels through the right atrium to the atrioventricular node, along the Bundle of His and through bundle branches to cause contraction of the heart muscle. This signal stimulates contraction first of the right and left atrium, and then the right and left ventricles. This process allows blood to be pumped throughout the body. The conduction system consists of specialised heart muscle cells, and is
+Doc 3(Title: "Sinoatrial node") Sinoatrial node The sinoatrial node (SA node), also known as sinus node, is a group of cells located in the wall of the right atrium of the heart. These cells have the ability to spontaneously produce an electrical impulse (action potential; see below for more details), that travels through the heart via the electrical conduction system (see figure 1) causing it to contract. In a healthy heart, the SA node continuously produces action potential, setting the rhythm of the heart and so is known as the heart's natural pacemaker. The rate of action potential production (and therefore the heart rate)
+</information>
+
+<think> From the information provided, it is clear that the heart's electrical impulses are generated in the sinoatrial (SA) node, which is often referred to as the heart's natural pacemaker. </think>
+
+<answer> sinoatrial (SA) node </answer>
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/misc/example/corpus.jsonl b/code/RL_model/verl/Search-R1/misc/example/corpus.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..fe0fd9a6f06bf7251b9c3fd1be9f40a2b93f0caf
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/example/corpus.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68b4ff871e19da6e63610517f5af253f118cce301b4d90a6ae007d0a86976491
+size 6399
diff --git a/code/RL_model/verl/Search-R1/misc/example/multinode/train_grpo_multinode_32b.sh b/code/RL_model/verl/Search-R1/misc/example/multinode/train_grpo_multinode_32b.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7add38592fe9194d02189d59669317316839047e
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/example/multinode/train_grpo_multinode_32b.sh
@@ -0,0 +1,77 @@
+data_name=nq_hotpotqa_train
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
+
+WAND_PROJECT="Search-R1"
+RAY_DASHBOARD_ADDRESS="http://xx.xx.xx.xx:8265" # your head node address
+N_NODES=4
+
+export BASE_MODEL='Qwen/Qwen2.5-32B'
+export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-32b-em-multinode-${N_NODES}
+
+# set -x
+export VLLM_ATTENTION_BACKEND=XFORMERS # vllm + qwen2-7b with flash_attn has some issues
+
+# max_prompt_length = (config['training']['max_start_length'] + config['training']['max_response_length'] * (config['training']['max_turns'] - 1) + config['training']['max_obs_length'] * config['training']['max_turns'])
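+# e.g. plugging in this script's settings (max_start_length=2048, max_response_length=500,
+# max_obs_length=500, max_turns=4): 2048 + 500*(4-1) + 500*4 = 5548 tokens as the
+# worst-case multi-turn prompt budget, assuming every turn uses its full length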
+
+ulimit -n 65535
+
+ray job submit --address=$RAY_DASHBOARD_ADDRESS \
+ --runtime-env=verl/trainer/runtime_env.yaml \
+ --no-wait \
+ -- \
+ python3 -m verl.trainer.main_ppo \
+ data.train_files=$DATA_DIR/train.parquet \
+ data.val_files=$DATA_DIR/test.parquet \
+ data.train_data_num=null \
+ data.val_data_num=null \
+ data.train_batch_size=512 \
+ data.val_batch_size=256 \
+ data.max_prompt_length=4096 \
+ data.max_response_length=500 \
+ data.max_start_length=2048 \
+ data.max_obs_length=500 \
+ data.shuffle_train_dataloader=True \
+ algorithm.adv_estimator=grpo \
+ actor_rollout_ref.model.path=$BASE_MODEL \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.optim.lr=2e-7 \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
+ actor_rollout_ref.actor.use_kl_loss=True \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=false \
+ actor_rollout_ref.actor.fsdp_config.grad_offload=false \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=false \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=false \
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+ algorithm.no_think_rl=false \
+ actor_rollout_ref.rollout.n_agent=5 \
+ actor_rollout_ref.rollout.temperature=1 \
+ actor_rollout_ref.actor.state_masking=True \
+ trainer.logger=['wandb'] \
+ +trainer.val_only=false \
+ +trainer.val_before_train=false \
+ trainer.default_hdfs_dir=null \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=$N_NODES \
+ trainer.save_freq=100 \
+ trainer.test_freq=100 \
+ trainer.project_name=$WAND_PROJECT \
+ trainer.experiment_name=$EXPERIMENT_NAME \
+ trainer.total_epochs=15 \
+ trainer.total_training_steps=1005 \
+ trainer.default_hdfs_dir=null \
+ trainer.default_local_dir=verl_checkpoints/$EXPERIMENT_NAME \
+ max_turns=4 \
+ retriever.url="http://127.0.0.1:8000/retrieve" \
+ retriever.topk=3 \
+ 2>&1 | tee $EXPERIMENT_NAME.log
diff --git a/code/RL_model/verl/Search-R1/misc/example/multinode/train_grpo_multinode_72b.sh b/code/RL_model/verl/Search-R1/misc/example/multinode/train_grpo_multinode_72b.sh
new file mode 100644
index 0000000000000000000000000000000000000000..100e928fff67fdfdcfdf00ecd1d3924b97d07d4c
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/example/multinode/train_grpo_multinode_72b.sh
@@ -0,0 +1,75 @@
+data_name=nq_hotpotqa_train
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
+
+WAND_PROJECT="Search-R1"
+RAY_DASHBOARD_ADDRESS="http://xx.xx.xx.xx:8265" # your head node address
+N_NODES=4
+
+export BASE_MODEL='Qwen/Qwen2.5-72B'
+export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-72b-em-multinode-${N_NODES}
+
+# set -x
+export VLLM_ATTENTION_BACKEND=XFORMERS # vllm + qwen2-7b with flash_attn has some issues
+
+ulimit -n 65535
+
+ray job submit --address=$RAY_DASHBOARD_ADDRESS \
+ --runtime-env=verl/trainer/runtime_env.yaml \
+ --no-wait \
+ -- \
+ python3 -m verl.trainer.main_ppo \
+ data.train_files=$DATA_DIR/train.parquet \
+ data.val_files=$DATA_DIR/test.parquet \
+ data.train_data_num=null \
+ data.val_data_num=null \
+ data.train_batch_size=512 \
+ data.val_batch_size=256 \
+ data.max_prompt_length=4096 \
+ data.max_response_length=500 \
+ data.max_start_length=2048 \
+ data.max_obs_length=500 \
+ data.shuffle_train_dataloader=True \
+ algorithm.adv_estimator=grpo \
+ actor_rollout_ref.model.path=$BASE_MODEL \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.optim.lr=1e-7 \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
+ actor_rollout_ref.actor.use_kl_loss=True \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size=32 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=True \
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+ algorithm.no_think_rl=false \
+ actor_rollout_ref.rollout.n_agent=5 \
+ actor_rollout_ref.rollout.temperature=1 \
+ actor_rollout_ref.actor.state_masking=True \
+ trainer.logger=['wandb'] \
+ +trainer.val_only=false \
+ +trainer.val_before_train=false \
+ trainer.default_hdfs_dir=null \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=$N_NODES \
+ trainer.save_freq=100 \
+ trainer.test_freq=100 \
+ trainer.project_name=$WAND_PROJECT \
+ trainer.experiment_name=$EXPERIMENT_NAME \
+ trainer.total_epochs=15 \
+ trainer.total_training_steps=1005 \
+ trainer.default_hdfs_dir=null \
+ trainer.default_local_dir=verl_checkpoints/$EXPERIMENT_NAME \
+ max_turns=4 \
+ retriever.url="http://127.0.0.1:8000/retrieve" \
+ retriever.topk=3 \
+ 2>&1 | tee $EXPERIMENT_NAME.log
diff --git a/code/RL_model/verl/Search-R1/misc/example/multinode/train_ppo_multinode_32b.sh b/code/RL_model/verl/Search-R1/misc/example/multinode/train_ppo_multinode_32b.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0cc93adaf092829158b06a49245bf7026b04fc14
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/example/multinode/train_ppo_multinode_32b.sh
@@ -0,0 +1,84 @@
+data_name=nq_hotpotqa_train
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
+
+WAND_PROJECT="Search-R1"
+RAY_DASHBOARD_ADDRESS="http://xx.xx.xx.xx:8265" # your head node address
+N_NODES=4
+
+export BASE_MODEL='Qwen/Qwen2.5-32B'
+export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-32b-em-multinode-${N_NODES}
+
+# set -x
+export VLLM_ATTENTION_BACKEND=XFORMERS
+
+ulimit -n 65535
+
+ray job submit --address=$RAY_DASHBOARD_ADDRESS \
+ --runtime-env=verl/trainer/runtime_env.yaml \
+ --no-wait \
+ -- \
+ python3 -m verl.trainer.main_ppo \
+ data.train_files=$DATA_DIR/train.parquet \
+ data.val_files=$DATA_DIR/test.parquet \
+ data.train_data_num=null \
+ data.val_data_num=null \
+ data.train_batch_size=512 \
+ data.val_batch_size=256 \
+ data.max_prompt_length=4096 \
+ data.max_response_length=500 \
+ data.max_start_length=2048 \
+ data.max_obs_length=500 \
+ data.shuffle_train_dataloader=True \
+ algorithm.adv_estimator=gae \
+ actor_rollout_ref.model.path=$BASE_MODEL \
+ actor_rollout_ref.actor.optim.lr=2e-7 \
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size=32 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.grad_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=False \
+ actor_rollout_ref.rollout.n_agent=1 \
+ actor_rollout_ref.rollout.temperature=1 \
+ actor_rollout_ref.rollout.top_p=1.0 \
+ actor_rollout_ref.actor.state_masking=true \
+ critic.optim.lr=1e-5 \
+ critic.model.use_remove_padding=True \
+ critic.optim.lr_warmup_steps_ratio=0.015 \
+ critic.model.path=$BASE_MODEL \
+ critic.model.enable_gradient_checkpointing=true \
+ critic.ppo_micro_batch_size=32 \
+ critic.model.fsdp_config.param_offload=False \
+ critic.model.fsdp_config.grad_offload=False \
+ critic.model.fsdp_config.optimizer_offload=True \
+ algorithm.kl_ctrl.kl_coef=0.001 \
+ algorithm.no_think_rl=false \
+ trainer.critic_warmup=0 \
+ trainer.logger=['wandb'] \
+ +trainer.val_only=false \
+ +trainer.val_before_train=true \
+ trainer.default_hdfs_dir=null \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=$N_NODES \
+ trainer.save_freq=100 \
+ trainer.test_freq=100 \
+ trainer.project_name=$WAND_PROJECT \
+ trainer.experiment_name=$EXPERIMENT_NAME \
+ trainer.total_epochs=15 \
+ trainer.total_training_steps=1005 \
+ trainer.default_hdfs_dir=null \
+ trainer.default_local_dir=verl_checkpoints/$EXPERIMENT_NAME \
+ max_turns=4 \
+ retriever.url="http://127.0.0.1:8000/retrieve" \
+ retriever.topk=3 \
+ 2>&1 | tee $EXPERIMENT_NAME.log
diff --git a/code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_ann.sh b/code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_ann.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f7dc3e7a2b43ef2e5bc84ee340a41be268591cd3
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_ann.sh
@@ -0,0 +1,12 @@
+
+file_path=/the/path/you/save/corpus
+index_file=$file_path/e5_HNSW64.index
+corpus_file=$file_path/wiki-18.jsonl
+retriever_name=e5
+retriever_path=intfloat/e5-base-v2
+
+python search_r1/search/retrieval_server.py --index_path $index_file \
+ --corpus_path $corpus_file \
+ --topk 3 \
+ --retriever_name $retriever_name \
+ --retriever_model $retriever_path
diff --git a/code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_bm25.sh b/code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_bm25.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6c4e1dce623ef6c527743f18289e4e046c4e6b16
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_bm25.sh
@@ -0,0 +1,10 @@
+
+file_path=/the/path/you/save/corpus
+index_file=$file_path/bm25
+corpus_file=$file_path/wiki-18.jsonl
+retriever_name=bm25
+
+python search_r1/search/retrieval_server.py --index_path $index_file \
+ --corpus_path $corpus_file \
+ --topk 3 \
+ --retriever_name $retriever_name
diff --git a/code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_google.sh b/code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_google.sh
new file mode 100644
index 0000000000000000000000000000000000000000..de0090273dfbca17a7a589dc19ca6366cbcb07dc
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_google.sh
@@ -0,0 +1,8 @@
+
+api_key="" # put your google custom API key here (https://developers.google.com/custom-search/v1/overview)
+cse_id="" # put your google cse API key here (https://developers.google.com/custom-search/v1/overview)
+
+python search_r1/search/internal_google_server.py --api_key $api_key \
+ --topk 5 \
+ --cse_id $cse_id \
+ --snippet_only
diff --git a/code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_hierarchical.sh b/code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_hierarchical.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7536b80866094c5560fedf345cbfbb48ad8115cd
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_hierarchical.sh
@@ -0,0 +1,17 @@
+
+file_path=/the/path/you/save/corpus
+index_file=$file_path/e5_Flat.index
+corpus_file=$file_path/wiki-18.jsonl
+retriever_name=e5
+retriever_path=intfloat/e5-base-v2
+reranker_path=cross-encoder/ms-marco-MiniLM-L12-v2
+
+python search_r1/search/retrieval_rerank_server.py --index_path $index_file \
+ --corpus_path $corpus_file \
+ --retrieval_topk 10 \
+ --retriever_name $retriever_name \
+ --retriever_model $retriever_path \
+ --faiss_gpu \
+ --reranking_topk 3 \
+ --reranker_model $reranker_path \
+ --reranker_batch_size 32
diff --git a/code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_serpapi.sh b/code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_serpapi.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c59d0189a99b2029c39bc80126633a517543a7e7
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_serpapi.sh
@@ -0,0 +1,7 @@
+
+search_url=https://serpapi.com/search
+serp_api_key="" # put your serp api key here (https://serpapi.com/)
+
+python search_r1/search/online_search_server.py --search_url $search_url \
+ --topk 3 \
+ --serp_api_key $serp_api_key
diff --git a/code/RL_model/verl/Search-R1/misc/public/head.png b/code/RL_model/verl/Search-R1/misc/public/head.png
new file mode 100644
index 0000000000000000000000000000000000000000..86ee00f202e15fb295f5921eb9f561260eb873b8
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/public/head.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a2a8f3ff56836ef01f77c026b59497d4f681ff7d0f21266ca505593ba682403
+size 109219
diff --git a/code/RL_model/verl/Search-R1/misc/public/llama32-3b.png b/code/RL_model/verl/Search-R1/misc/public/llama32-3b.png
new file mode 100644
index 0000000000000000000000000000000000000000..ae89d884b169cad4b352c5a53d480c7fe1bb9afb
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/public/llama32-3b.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:714caac0fc3a4c1141e8a48f36af00eac26bff94831d3ca9c97cc591ba13ad9f
+size 112678
diff --git a/code/RL_model/verl/Search-R1/misc/public/logo.png b/code/RL_model/verl/Search-R1/misc/public/logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..5f1fcbbdebe6491a8a0d6d90b79f1eb2346c5462
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/public/logo.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9173f0eb939c124f2cda6fa4fae52e134bcc3d3281cc217ebe36f4fe346f3eb2
+size 1345086
diff --git a/code/RL_model/verl/Search-R1/misc/public/main.png b/code/RL_model/verl/Search-R1/misc/public/main.png
new file mode 100644
index 0000000000000000000000000000000000000000..ce21978fd0f1d302836c334c6f43d62451c5ea40
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/public/main.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13c26a58d83c919ea3d0391a7954d2cb4667ef7cc45e892a648bc431b40705fd
+size 456505
diff --git a/code/RL_model/verl/Search-R1/misc/public/multi-turn.png b/code/RL_model/verl/Search-R1/misc/public/multi-turn.png
new file mode 100644
index 0000000000000000000000000000000000000000..afa62553828b24109700e02bd79149394cc46c6c
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/public/multi-turn.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9faadbe3f414a8e7458c2e7a753f996e372da3bfe7a3c7b74b72548605e8291b
+size 644091
diff --git a/code/RL_model/verl/Search-R1/misc/public/single-turn.png b/code/RL_model/verl/Search-R1/misc/public/single-turn.png
new file mode 100644
index 0000000000000000000000000000000000000000..8f82f15090f04787ed719d62dcdbf0d6c5e502e3
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/public/single-turn.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2bec22953dba5593e59814682536a9c75a15d944e137a385ea003216231ec8c
+size 387393
diff --git a/code/RL_model/verl/Search-R1/misc/public/status.png b/code/RL_model/verl/Search-R1/misc/public/status.png
new file mode 100644
index 0000000000000000000000000000000000000000..ea477b730910363d913561ddecb1f8cbfebb749f
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/public/status.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:767391782be9e88c44ad63545662ab78608924c86d580336ed222ee3574c8918
+size 60021
diff --git a/code/RL_model/verl/Search-R1/misc/public/worker.png b/code/RL_model/verl/Search-R1/misc/public/worker.png
new file mode 100644
index 0000000000000000000000000000000000000000..d32de7444de98aad7c4b11b8049614423f3b9571
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/public/worker.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:798acafb35a9feaf847ab36347d38d94820ab6a7aa9f3e2df056d5f37e27f37f
+size 31303
diff --git a/code/RL_model/verl/Search-R1/misc/scripts/data_process/nq.py b/code/RL_model/verl/Search-R1/misc/scripts/data_process/nq.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7fcbf2ae354d9ca2a38805eda842e97a829511f
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/scripts/data_process/nq.py
@@ -0,0 +1,100 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Preprocess the nq dataset to parquet format
+"""
+
+import re
+import os
+import datasets
+
+from verl.utils.hdfs_io import copy, makedirs
+import argparse
+
+
+def make_prefix(dp, template_type):
+ question = dp['question']
+
+ # NOTE: also need to change reward_score/countdown.py
+ if template_type == 'base':
+ """This works for any base model"""
+ prefix = f"""Answer the given question. \
+You should first have a reasoning process in mind and then provide the answer. \
+Show your reasoning in <think> </think> tags and return the final answer in <answer> </answer> tags, for example <answer> Beijing </answer>. \
+Question: {question}\n"""
+ else:
+ raise NotImplementedError
+ return prefix
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--local_dir', default='./data/nq')
+ parser.add_argument('--hdfs_dir', default=None)
+ parser.add_argument('--template_type', type=str, default='base')
+
+ args = parser.parse_args()
+
+ data_source = 'nq'
+
+ dataset = datasets.load_dataset('RUC-NLPIR/FlashRAG_datasets', 'nq')
+
+ train_dataset = dataset['train']
+ test_dataset = dataset['test']
+
+ # add a row to each data item that represents a unique id
+ def make_map_fn(split):
+
+ def process_fn(example, idx):
+ example['question'] = example['question'].strip()
+ if example['question'][-1] != '?':
+ example['question'] += '?'
+ question = make_prefix(example, template_type=args.template_type)
+ solution = {
+ "target": example['golden_answers'],
+ }
+
+ data = {
+ "data_source": data_source,
+ "prompt": [{
+ "role": "user",
+ "content": question,
+ }],
+ "ability": "fact-reasoning",
+ "reward_model": {
+ "style": "rule",
+ "ground_truth": solution
+ },
+ "extra_info": {
+ 'split': split,
+ 'index': idx,
+ }
+ }
+ return data
+
+ return process_fn
+
+ train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
+ test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)
+
+ local_dir = args.local_dir
+ hdfs_dir = args.hdfs_dir
+
+ train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
+ test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))
+
+ if hdfs_dir is not None:
+ makedirs(hdfs_dir)
+
+ copy(src=local_dir, dst=hdfs_dir)
diff --git a/code/RL_model/verl/Search-R1/misc/scripts/data_process/nq_rag.py b/code/RL_model/verl/Search-R1/misc/scripts/data_process/nq_rag.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ce77376ffbbf0fe7e72809070e225f50e2033eb
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/scripts/data_process/nq_rag.py
@@ -0,0 +1,141 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Preprocess the nq dataset to parquet format
+"""
+
+import re
+import os
+import json
+import datasets
+
+from verl.utils.hdfs_io import copy, makedirs
+import argparse
+
+
+def make_prefix(dp, template_type):
+ question = dp['question']
+ context = dp['context']
+
+ # NOTE: also need to change reward_score/countdown.py
+ if template_type == 'base':
+ """This works for any base model"""
+ prefix = f"""Answer the given question with some potentially useful context. \
+You should analyze the question carefully, evaluate the given context (which may or may not be useful), and then generate an accurate and well-reasoned response. \
+You should first have a reasoning process in mind and then provide the answer. \
+Show your reasoning in <think> </think> tags and return the final answer in <answer> </answer> tags, for example <answer> Beijing </answer>. \
+Question: {question} Context: {context} \n"""
+ else:
+ raise NotImplementedError
+ return prefix
+
+
+def format_reference(retrieval_result):
+    # avoid shadowing the function's own name with the accumulator
+    formatted = ''
+    for idx, doc_item in enumerate(retrieval_result):
+        content = doc_item['contents']
+        title = content.split("\n")[0]
+        text = "\n".join(content.split("\n")[1:])
+        formatted += f"Doc {idx+1}(Title: {title}) {text}\n"
+
+    return formatted
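+# Illustrative example: a doc_item whose 'contents' is
+# "Paul Walker\nPaul William Walker IV (1973-2013) was an American actor..." is rendered as
+# "Doc 1(Title: Paul Walker) Paul William Walker IV (1973-2013) was an American actor...\n"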
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--local_dir', default='./data/nq_rag')
+ parser.add_argument('--hdfs_dir', default=None)
+ parser.add_argument('--template_type', type=str, default='base')
+ parser.add_argument('--topk', type=int, default=3)
+ parser.add_argument('--corpus_path', type=str, default='/home/peterjin/mnt/data/retrieval-corpus/wiki-18.jsonl')
+ parser.add_argument('--train_retrieval_cache', type=str, default='/home/peterjin/rag_retrieval_cache/nq/e5_train_retrieval_cache_2048.json')
+ parser.add_argument('--test_retrieval_cache', type=str, default='/home/peterjin/rag_retrieval_cache/nq/e5_test_retrieval_cache_10000.json')
+
+ args = parser.parse_args()
+
+ data_source = 'nq'
+
+ dataset = datasets.load_dataset('RUC-NLPIR/FlashRAG_datasets', 'nq')
+
+ train_dataset = dataset['train']
+ test_dataset = dataset['test']
+
+ # read retrieval cache
+ print('reading retrieval cache...')
+ retrieval_cache = json.load(open(args.train_retrieval_cache))
+ # test_retrieval_cache = json.load(open(args.test_retrieval_cache))
+ retrieval_cache.update(json.load(open(args.test_retrieval_cache)))
+
+ # read corpus
+ print('reading corpus...')
+ corpus = {}
+ with open(args.corpus_path) as f:
+ readin = f.readlines()
+ for line in readin:
+ tmp = json.loads(line)
+ corpus[tmp['id']] = tmp
+
+ # add a column for the retrieval context
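+    # (each retrieval_cache entry maps a question string to a ranked list of
+    # {"id": ...} records, which are resolved against the corpus dict built above)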
+ def add_context(example):
+ example['context'] = format_reference([corpus[docs["id"]] for docs in retrieval_cache[example['question']][:args.topk]])
+ return example
+
+ train_dataset = train_dataset.map(function=add_context)
+ test_dataset = test_dataset.map(function=add_context)
+
+ # add a row to each data item that represents a unique id
+ def make_map_fn(split):
+
+ def process_fn(example, idx):
+ example['question'] = example['question'].strip()
+ if example['question'][-1] != '?':
+ example['question'] += '?'
+ question = make_prefix(example, template_type=args.template_type)
+ solution = {
+ "target": example['golden_answers'],
+ }
+
+ data = {
+ "data_source": data_source,
+ "prompt": [{
+ "role": "user",
+ "content": question,
+ }],
+ "ability": "fact-reasoning",
+ "reward_model": {
+ "style": "rule",
+ "ground_truth": solution
+ },
+ "extra_info": {
+ 'split': split,
+ 'index': idx,
+ }
+ }
+ return data
+
+ return process_fn
+
+ train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
+ test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)
+
+ local_dir = args.local_dir
+ hdfs_dir = args.hdfs_dir
+
+ train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
+ test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))
+
+ if hdfs_dir is not None:
+ makedirs(hdfs_dir)
+
+ copy(src=local_dir, dst=hdfs_dir)
diff --git a/code/RL_model/verl/Search-R1/misc/scripts/data_process/nq_search.py b/code/RL_model/verl/Search-R1/misc/scripts/data_process/nq_search.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d9e04561eee70dc4bf20713b4666cb00f424669
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/scripts/data_process/nq_search.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Preprocess the nq dataset to parquet format
+"""
+
+import re
+import os
+import datasets
+
+from verl.utils.hdfs_io import copy, makedirs
+import argparse
+
+
+def make_prefix(dp, template_type):
+ question = dp['question']
+
+ # NOTE: also need to change reward_score/countdown.py
+ if template_type == 'base':
+ """This works for any base model"""
+ prefix = f"""Answer the given question. \
+You must conduct reasoning inside <think> and </think> first every time you get new information. \
+After reasoning, if you find you lack some knowledge, you can call a search engine by <search> query </search> and it will return the top searched results between <information> and </information>. \
+You can search as many times as you want. \
+If you find no further external knowledge needed, you can directly provide the answer inside <answer> and </answer>, without detailed illustrations. For example, <answer> Beijing </answer>. Question: {question}\n"""
+ else:
+ raise NotImplementedError
+ return prefix
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--local_dir', default='./data/nq_search')
+ parser.add_argument('--hdfs_dir', default=None)
+ parser.add_argument('--template_type', type=str, default='base')
+
+ args = parser.parse_args()
+
+ data_source = 'nq'
+
+ dataset = datasets.load_dataset('RUC-NLPIR/FlashRAG_datasets', 'nq')
+
+ train_dataset = dataset['train']
+ test_dataset = dataset['test']
+
+ # add a row to each data item that represents a unique id
+ def make_map_fn(split):
+
+ def process_fn(example, idx):
+ example['question'] = example['question'].strip()
+ if example['question'][-1] != '?':
+ example['question'] += '?'
+ question = make_prefix(example, template_type=args.template_type)
+ solution = {
+ "target": example['golden_answers'],
+ }
+
+ data = {
+ "data_source": data_source,
+ "prompt": [{
+ "role": "user",
+ "content": question,
+ }],
+ "ability": "fact-reasoning",
+ "reward_model": {
+ "style": "rule",
+ "ground_truth": solution
+ },
+ "extra_info": {
+ 'split': split,
+ 'index': idx,
+ }
+ }
+ return data
+
+ return process_fn
+
+ train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
+ test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)
+
+ local_dir = args.local_dir
+ hdfs_dir = args.hdfs_dir
+
+ train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
+ test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))
+
+ if hdfs_dir is not None:
+ makedirs(hdfs_dir)
+
+ copy(src=local_dir, dst=hdfs_dir)
diff --git a/code/RL_model/verl/Search-R1/misc/scripts/data_process/qa_search_test_merge.py b/code/RL_model/verl/Search-R1/misc/scripts/data_process/qa_search_test_merge.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bc98b81511824b445c392fe3e5462829ec28463
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/scripts/data_process/qa_search_test_merge.py
@@ -0,0 +1,115 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Preprocess the QA dataset to parquet format
+"""
+
+import re
+import os
+import datasets
+
+from verl.utils.hdfs_io import copy, makedirs
+import argparse
+
+
+def make_prefix(dp, template_type):
+ question = dp['question']
+
+ # NOTE: also need to change reward_score/countdown.py
+ if template_type == 'base':
+ """This works for any base model"""
+ prefix = f"""Answer the given question. \
+You must conduct reasoning inside <think> and </think> first every time you get new information. \
+After reasoning, if you find you lack some knowledge, you can call a search engine by <search> query </search> and it will return the top searched results between <information> and </information>. \
+You can search as many times as you want. \
+If you find no further external knowledge needed, you can directly provide the answer inside <answer> and </answer>, without detailed illustrations. For example, <answer> Beijing </answer>. Question: {question}\n"""
+ else:
+ raise NotImplementedError
+ return prefix
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--local_dir', default='./data/nq_search')
+ parser.add_argument('--hdfs_dir', default=None)
+ parser.add_argument('--template_type', type=str, default='base')
+ parser.add_argument('--data_sources', default='nq')
+
+ args = parser.parse_args()
+
+ data_sources = args.data_sources.split(',')
+ all_dataset = []
+
+ for data_source in data_sources:
+
+ if data_source != 'strategyqa':
+ dataset = datasets.load_dataset('RUC-NLPIR/FlashRAG_datasets', data_source)
+ else:
+ dataset = datasets.load_dataset('json', data_files="/home/peterjin/mnt/data/strategyqa/test_correct.jsonl")
+
+ if 'test' in dataset:
+ print(f'Using the {data_source} test dataset...')
+ test_dataset = dataset['test']
+ elif 'dev' in dataset:
+ print(f'Using the {data_source} dev dataset...')
+ test_dataset = dataset['dev']
+ else:
+ print(f'Using the {data_source} train dataset...')
+ test_dataset = dataset['train']
+
+ # add a row to each data item that represents a unique id
+ def make_map_fn(split):
+
+ def process_fn(example, idx):
+ example['question'] = example['question'].strip()
+ if example['question'][-1] != '?':
+ example['question'] += '?'
+ question = make_prefix(example, template_type=args.template_type)
+ solution = {
+ "target": example['golden_answers'],
+ }
+
+ data = {
+ "data_source": data_source,
+ "prompt": [{
+ "role": "user",
+ "content": question,
+ }],
+ "ability": "fact-reasoning",
+ "reward_model": {
+ "style": "rule",
+ "ground_truth": solution
+ },
+ "extra_info": {
+ 'split': split,
+ 'index': idx,
+ }
+ }
+ return data
+
+ return process_fn
+
+ test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)
+ all_dataset.append(test_dataset)
+
+ local_dir = args.local_dir
+ hdfs_dir = args.hdfs_dir
+
+ all_test_dataset = datasets.concatenate_datasets(all_dataset)
+ all_test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))
+
+ if hdfs_dir is not None:
+ makedirs(hdfs_dir)
+
+ copy(src=local_dir, dst=hdfs_dir)
diff --git a/code/RL_model/verl/Search-R1/misc/scripts/data_process/qa_search_train_merge.py b/code/RL_model/verl/Search-R1/misc/scripts/data_process/qa_search_train_merge.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac8de657b97bff9b084dcd718edfcfab9201b2b5
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/scripts/data_process/qa_search_train_merge.py
@@ -0,0 +1,105 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Preprocess the QA dataset to parquet format
+"""
+
+import re
+import os
+import datasets
+
+from verl.utils.hdfs_io import copy, makedirs
+import argparse
+
+
+def make_prefix(dp, template_type):
+ question = dp['question']
+
+ # NOTE: also need to change reward_score/countdown.py
+ if template_type == 'base':
+ """This works for any base model"""
+ prefix = f"""Answer the given question. \
+You must conduct reasoning inside <think> and </think> first every time you get new information. \
+After reasoning, if you find you lack some knowledge, you can call a search engine by <search> query </search> and it will return the top searched results between <information> and </information>. \
+You can search as many times as you want. \
+If you find no further external knowledge needed, you can directly provide the answer inside <answer> and </answer>, without detailed illustrations. For example, <answer> Beijing </answer>. Question: {question}\n"""
+ else:
+ raise NotImplementedError
+ return prefix
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--local_dir', default='./data/nq_search')
+ parser.add_argument('--hdfs_dir', default=None)
+ parser.add_argument('--template_type', type=str, default='base')
+ parser.add_argument('--data_sources', default='nq')
+
+ args = parser.parse_args()
+
+ # data_source = 'nq'
+ data_sources = args.data_sources.split(',')
+ all_dataset = []
+
+ for data_source in data_sources:
+
+ dataset = datasets.load_dataset('RUC-NLPIR/FlashRAG_datasets', data_source)
+
+ train_dataset = dataset['train']
+
+ # add a row to each data item that represents a unique id
+ def make_map_fn(split):
+
+ def process_fn(example, idx):
+ example['question'] = example['question'].strip()
+ if example['question'][-1] != '?':
+ example['question'] += '?'
+ question = make_prefix(example, template_type=args.template_type)
+ solution = {
+ "target": example['golden_answers'],
+ }
+
+ data = {
+ "data_source": data_source,
+ "prompt": [{
+ "role": "user",
+ "content": question,
+ }],
+ "ability": "fact-reasoning",
+ "reward_model": {
+ "style": "rule",
+ "ground_truth": solution
+ },
+ "extra_info": {
+ 'split': split,
+ 'index': idx,
+ }
+ }
+ return data
+
+ return process_fn
+
+ train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
+ all_dataset.append(train_dataset)
+
+ local_dir = args.local_dir
+ hdfs_dir = args.hdfs_dir
+
+ all_train_dataset = datasets.concatenate_datasets(all_dataset)
+ all_train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
+
+ if hdfs_dir is not None:
+ makedirs(hdfs_dir)
+
+ copy(src=local_dir, dst=hdfs_dir)
diff --git a/code/RL_model/verl/Search-R1/misc/scripts/download.py b/code/RL_model/verl/Search-R1/misc/scripts/download.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8438a45711e4d170f6b3e6c34ca740951a9ab70
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/scripts/download.py
@@ -0,0 +1,25 @@
+import argparse
+from huggingface_hub import hf_hub_download
+
+parser = argparse.ArgumentParser(description="Download files from a Hugging Face dataset repository.")
+parser.add_argument("--repo_id", type=str, default="PeterJinGo/wiki-18-e5-index", help="Hugging Face repository ID")
+parser.add_argument("--save_path", type=str, required=True, help="Local directory to save files")
+
+args = parser.parse_args()
+
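+# the e5 Flat index is published as two split parts; concatenate them afterwards
+# (see download.sh or the nq_hotpotqa README) to rebuild e5_Flat.index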
+repo_id = "PeterJinGo/wiki-18-e5-index"
+for file in ["part_aa", "part_ab"]:
+ hf_hub_download(
+ repo_id=repo_id,
+ filename=file, # e.g., "e5_Flat.index"
+ repo_type="dataset",
+ local_dir=args.save_path,
+ )
+
+repo_id = "PeterJinGo/wiki-18-corpus"
+hf_hub_download(
+ repo_id=repo_id,
+ filename="wiki-18.jsonl.gz",
+ repo_type="dataset",
+ local_dir=args.save_path,
+)
diff --git a/code/RL_model/verl/Search-R1/misc/scripts/download.sh b/code/RL_model/verl/Search-R1/misc/scripts/download.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e33e717dfb2900a885d600396dc6bbd9921a1c1c
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/scripts/download.sh
@@ -0,0 +1,6 @@
+
+save_path=/home/peterjin/debug_cache
+
+python download.py --save_path $save_path
+
+cat $save_path/part_* > $save_path/e5_Flat.index
diff --git a/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/README.md b/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad48c1169e9a9687c057662df49bcd15784e4bcc
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/README.md
@@ -0,0 +1,42 @@
+
+## Reproduce the paper results
+
+### Download the dataset
+
+```bash
+huggingface-cli download --repo-type dataset PeterJinGo/nq_hotpotqa_train --local-dir $WORK_DIR/data/nq_hotpotqa_train
+```
+
+### Launch the local search engine
+
+(1) Download the index and corpus.
+```bash
+save_path=/the/path/to/save
+python scripts/download.py --save_path $save_path
+cat $save_path/part_* > $save_path/e5_Flat.index
+gzip -d $save_path/wiki-18.jsonl.gz
+```
+
+(2) Launch a local retrieval server.
+```bash
+conda activate retriever
+bash retrieval_launch.sh
+```
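+
+Once the server is up, you can sanity-check it with a manual query. This is a minimal sketch: it assumes the server exposes the same ```/retrieve``` endpoint the training scripts point at and accepts a JSON body with ```queries``` and ```topk``` fields; adjust if your launcher differs.
+```bash
+curl -s http://127.0.0.1:8000/retrieve \
+  -H "Content-Type: application/json" \
+  -d '{"queries": ["origin of the heart electrical impulses"], "topk": 3}'
+```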
+
+### Run PPO training
+```bash
+bash train_ppo.sh
+```
+
+
+### Run GRPO training
+```bash
+bash train_grpo.sh
+```
+
+### Run evaluation
+```bash
+bash evaluate.sh
+```
+
+You can change ```$BASE_MODEL``` to the path of the model you would like to evaluate.
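+For example, inside ```evaluate.sh``` (a hypothetical checkpoint path; substitute whatever your training run actually saved under ```verl_checkpoints/```):
+
+```bash
+export BASE_MODEL=verl_checkpoints/nq_hotpotqa_train-search-r1-ppo-llama3.2-3b-em/actor/global_step_300
+```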
diff --git a/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/data_process.sh b/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/data_process.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ae1b45be776bf596c3c93e79315fd334ee6d5407
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/data_process.sh
@@ -0,0 +1,10 @@
+WORK_DIR=your/work/dir
+LOCAL_DIR=$WORK_DIR/data/nq_hotpotqa_train
+
+## process multiple dataset search format train file
+DATA=nq,hotpotqa
+python $WORK_DIR/scripts/data_process/qa_search_train_merge.py --local_dir $LOCAL_DIR --data_sources $DATA
+
+## process multiple dataset search format test file
+DATA=nq,triviaqa,popqa,hotpotqa,2wikimultihopqa,musique,bamboogle
+python $WORK_DIR/scripts/data_process/qa_search_test_merge.py --local_dir $LOCAL_DIR --data_sources $DATA
diff --git a/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/evaluate.sh b/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/evaluate.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1b0067fda90778d0d5d3a8b0c8bf6aef2a7024b1
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/evaluate.sh
@@ -0,0 +1,65 @@
+data_name=nq_hotpotqa_train
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
+
+export BASE_MODEL=""
+
+# set -x
+export VLLM_ATTENTION_BACKEND=XFORMERS # vllm + qwen2-7b with flash_attn has some issues
+
+# max_prompt_length = (config['training']['max_start_length'] + config['training']['max_response_length'] * (config['training']['max_turns'] - 1) + config['training']['max_obs_length'] * config['training']['max_turns'])
+
+PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
+ data.train_files=$DATA_DIR/train.parquet \
+ data.val_files=$DATA_DIR/test.parquet \
+ data.train_data_num=null \
+ data.val_data_num=null \
+ data.train_batch_size=512 \
+ data.val_batch_size=256 \
+ data.max_prompt_length=4096 \
+ data.max_response_length=500 \
+ data.max_start_length=2048 \
+ data.max_obs_length=500 \
+ data.shuffle_train_dataloader=True \
+ algorithm.adv_estimator=gae \
+ actor_rollout_ref.model.path=$BASE_MODEL \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.95 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=true \
+ actor_rollout_ref.actor.fsdp_config.grad_offload=true \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=true \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ actor_rollout_ref.rollout.n_agent=1 \
+ actor_rollout_ref.rollout.temperature=1 \
+ actor_rollout_ref.actor.state_masking=true \
+ critic.optim.lr=1e-5 \
+ critic.model.use_remove_padding=True \
+ critic.optim.lr_warmup_steps_ratio=0.05 \
+ critic.model.path=$BASE_MODEL \
+ critic.model.enable_gradient_checkpointing=true \
+ critic.ppo_micro_batch_size=8 \
+ critic.model.fsdp_config.param_offload=true \
+ critic.model.fsdp_config.grad_offload=true \
+ critic.model.fsdp_config.optimizer_offload=true \
+ algorithm.kl_ctrl.kl_coef=0.001 \
+ algorithm.no_think_rl=false \
+ trainer.critic_warmup=0 \
+ trainer.logger=[] \
+ +trainer.val_only=true \
+ +trainer.val_before_train=true \
+ trainer.default_hdfs_dir=null \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ max_turns=4 \
+ retriever.url="http://127.0.0.1:8000/retrieve" \
+ retriever.topk=3
diff --git a/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.1/train_grpo.sh b/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.1/train_grpo.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8386975357a5e3a4ba2ecc465679ad213429f385
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.1/train_grpo.sh
@@ -0,0 +1,84 @@
+data_name=nq_hotpotqa_train
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
+
+WAND_PROJECT="Search-R1"
+
+export BASE_MODEL='meta-llama/Llama-3.2-3B'
+export EXPERIMENT_NAME=${data_name}-search-r1-grpo-llama3.2-3b-em
+# export BASE_MODEL='meta-llama/Llama-3.2-3B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-llama3.2-3b-it-em
+# export BASE_MODEL='meta-llama/Llama-3.1-8B'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-llama3.1-8b-em
+# export BASE_MODEL='meta-llama/Llama-3.1-8B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-llama3.1-8b-it-em
+
+# export BASE_MODEL='Qwen/Qwen2.5-3B'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-3b-em
+# export BASE_MODEL='Qwen/Qwen2.5-3B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-3b-it-em
+# export BASE_MODEL='Qwen/Qwen2.5-7B'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-7b-em
+# export BASE_MODEL='Qwen/Qwen2.5-7B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-7b-it-em
+
+# set -x
+export VLLM_ATTENTION_BACKEND=XFORMERS # vllm + qwen2-7b with flash_attn has some issues
+
+# max_prompt_length = (config['training']['max_start_length'] + config['training']['max_response_length'] * (config['training']['max_turns'] - 1) + config['training']['max_obs_length'] * config['training']['max_turns'])
+
+PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
+ data.train_files=$DATA_DIR/train.parquet \
+ data.val_files=$DATA_DIR/test.parquet \
+ data.train_data_num=null \
+ data.val_data_num=null \
+ data.train_batch_size=512 \
+ data.val_batch_size=256 \
+ data.max_prompt_length=4096 \
+ data.max_response_length=500 \
+ data.max_start_length=2048 \
+ data.max_obs_length=500 \
+ data.shuffle_train_dataloader=True \
+ algorithm.adv_estimator=grpo \
+ actor_rollout_ref.model.path=$BASE_MODEL \
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.95 \
+ actor_rollout_ref.actor.use_kl_loss=true \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=true \
+ actor_rollout_ref.actor.fsdp_config.grad_offload=true \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=true \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+ algorithm.no_think_rl=false \
+ actor_rollout_ref.rollout.n_agent=5 \
+ actor_rollout_ref.rollout.temperature=1 \
+ actor_rollout_ref.actor.state_masking=true \
+ trainer.logger=['wandb'] \
+ +trainer.val_only=false \
+ +trainer.val_before_train=true \
+ trainer.default_hdfs_dir=null \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=100 \
+ trainer.test_freq=50 \
+ trainer.project_name=$WAND_PROJECT \
+ trainer.experiment_name=$EXPERIMENT_NAME \
+ trainer.total_epochs=15 \
+ trainer.total_training_steps=305 \
+ trainer.default_hdfs_dir=null \
+ trainer.default_local_dir=verl_checkpoints/$EXPERIMENT_NAME \
+ max_turns=4 \
+ retriever.url="http://127.0.0.1:8000/retrieve" \
+ retriever.topk=3 \
+ 2>&1 | tee $EXPERIMENT_NAME.log
diff --git a/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.1/train_ppo.sh b/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.1/train_ppo.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8a060d65caf9a571ddc49c9bc4f0d117dda14b24
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.1/train_ppo.sh
@@ -0,0 +1,92 @@
+data_name=nq_hotpotqa_train
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
+
+WAND_PROJECT="Search-R1"
+
+export BASE_MODEL='meta-llama/Llama-3.2-3B'
+export EXPERIMENT_NAME=${data_name}-search-r1-ppo-llama3.2-3b-em
+# export BASE_MODEL='meta-llama/Llama-3.2-3B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-ppo-llama3.2-3b-it-em
+# export BASE_MODEL='meta-llama/Llama-3.1-8B'
+# export EXPERIMENT_NAME=${data_name}-search-r1-ppo-llama3.1-8b-em
+# export BASE_MODEL='meta-llama/Llama-3.1-8B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-ppo-llama3.1-8b-it-em
+
+# export BASE_MODEL='Qwen/Qwen2.5-3B'
+# export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-3b-em
+# export BASE_MODEL='Qwen/Qwen2.5-3B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-3b-it-em
+# export BASE_MODEL='Qwen/Qwen2.5-7B'
+# export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-7b-em
+# export BASE_MODEL='Qwen/Qwen2.5-7B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-7b-it-em
+
+# set -x
+export VLLM_ATTENTION_BACKEND=XFORMERS # vllm + qwen2-7b with flash_attn has some issues
+
+# max_prompt_length = (config['training']['max_start_length'] + config['training']['max_response_length'] * (config['training']['max_turns'] - 1) + config['training']['max_obs_length'] * config['training']['max_turns'])
+
+PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
+ data.train_files=$DATA_DIR/train.parquet \
+ data.val_files=$DATA_DIR/test.parquet \
+ data.train_data_num=null \
+ data.val_data_num=null \
+ data.train_batch_size=512 \
+ data.val_batch_size=256 \
+ data.max_prompt_length=4096 \
+ data.max_response_length=500 \
+ data.max_start_length=2048 \
+ data.max_obs_length=500 \
+ data.shuffle_train_dataloader=True \
+ algorithm.adv_estimator=gae \
+ actor_rollout_ref.model.path=$BASE_MODEL \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.95 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=true \
+ actor_rollout_ref.actor.fsdp_config.grad_offload=true \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=true \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ actor_rollout_ref.rollout.n_agent=1 \
+ actor_rollout_ref.rollout.temperature=1 \
+ actor_rollout_ref.actor.state_masking=true \
+ critic.optim.lr=1e-5 \
+ critic.model.use_remove_padding=True \
+ critic.optim.lr_warmup_steps_ratio=0.05 \
+ critic.model.path=$BASE_MODEL \
+ critic.model.enable_gradient_checkpointing=true \
+ critic.ppo_micro_batch_size=8 \
+ critic.model.fsdp_config.param_offload=true \
+ critic.model.fsdp_config.grad_offload=true \
+ critic.model.fsdp_config.optimizer_offload=true \
+ algorithm.kl_ctrl.kl_coef=0.001 \
+ algorithm.no_think_rl=false \
+ trainer.critic_warmup=0 \
+ trainer.logger=['wandb'] \
+ +trainer.val_only=false \
+ +trainer.val_before_train=true \
+ trainer.default_hdfs_dir=null \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=100 \
+ trainer.test_freq=50 \
+ trainer.project_name=$WAND_PROJECT \
+ trainer.experiment_name=$EXPERIMENT_NAME \
+ trainer.total_epochs=15 \
+ trainer.total_training_steps=305 \
+ trainer.default_hdfs_dir=null \
+ trainer.default_local_dir=verl_checkpoints/$EXPERIMENT_NAME \
+ max_turns=4 \
+ retriever.url="http://127.0.0.1:8000/retrieve" \
+ retriever.topk=3 \
+ 2>&1 | tee $EXPERIMENT_NAME.log
diff --git a/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.2/train_grpo.sh b/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.2/train_grpo.sh
new file mode 100644
index 0000000000000000000000000000000000000000..240acb5a5d7d1e4d99d8e152acee951eb8badbbe
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.2/train_grpo.sh
@@ -0,0 +1,79 @@
+data_name=nq_hotpotqa_train
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
+
+WAND_PROJECT="Search-R1"
+
+# export BASE_MODEL='Qwen/Qwen2.5-3B'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-3b-em
+# export BASE_MODEL='Qwen/Qwen2.5-3B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-3b-it-em
+export BASE_MODEL='Qwen/Qwen2.5-7B'
+export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-7b-em
+# export BASE_MODEL='Qwen/Qwen2.5-7B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-7b-it-em
+# export BASE_MODEL='Qwen/Qwen2.5-14B'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-14b-em
+# export BASE_MODEL='Qwen/Qwen2.5-14B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-14b-it-em
+
+# set -x
+export VLLM_ATTENTION_BACKEND=XFORMERS # vLLM with Qwen2-7B has known issues under flash_attn, so force the xformers backend
+
+# max_prompt_length = (config['training']['max_start_length'] + config['training']['max_response_length'] * (config['training']['max_turns'] - 1) + config['training']['max_obs_length'] * config['training']['max_turns'])
+
+PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
+ data.train_files=$DATA_DIR/train.parquet \
+ data.val_files=$DATA_DIR/test.parquet \
+ data.train_data_num=null \
+ data.val_data_num=null \
+ data.train_batch_size=512 \
+ data.val_batch_size=256 \
+ data.max_prompt_length=4096 \
+ data.max_response_length=500 \
+ data.max_start_length=2048 \
+ data.max_obs_length=500 \
+ data.shuffle_train_dataloader=True \
+ algorithm.adv_estimator=grpo \
+ actor_rollout_ref.model.path=$BASE_MODEL \
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
+ actor_rollout_ref.actor.use_kl_loss=true \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=true \
+ actor_rollout_ref.actor.fsdp_config.grad_offload=true \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=true \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+ algorithm.no_think_rl=false \
+ actor_rollout_ref.rollout.n_agent=5 \
+ actor_rollout_ref.rollout.temperature=1 \
+ actor_rollout_ref.actor.state_masking=true \
+ trainer.logger=['wandb'] \
+ +trainer.val_only=false \
+ +trainer.val_before_train=true \
+ trainer.default_hdfs_dir=null \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=100 \
+ trainer.test_freq=100 \
+ trainer.project_name=$WAND_PROJECT \
+ trainer.experiment_name=$EXPERIMENT_NAME \
+ trainer.total_epochs=15 \
+ trainer.total_training_steps=1005 \
+ trainer.default_local_dir=verl_checkpoints/$EXPERIMENT_NAME \
+ max_turns=4 \
+ retriever.url="http://127.0.0.1:8000/retrieve" \
+ retriever.topk=3 \
+ 2>&1 | tee $EXPERIMENT_NAME.log
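
Relative to the PPO script, the key changes here are `algorithm.adv_estimator=grpo`, `actor_rollout_ref.rollout.n_agent=5` (five rollouts per prompt), KL applied as a loss term rather than a reward penalty, and no `critic.*` overrides, since GRPO needs no value network. A minimal sketch of the group-normalized advantage the `grpo` estimator computes (illustrative only, not verl's batched implementation):

```python
import torch

def grpo_advantages(rewards: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    """Group-relative advantages: score each prompt's rollouts against the
    mean/std of their own group. rewards: [num_prompts, group_size]."""
    mean = rewards.mean(dim=1, keepdim=True)
    std = rewards.std(dim=1, keepdim=True)
    return (rewards - mean) / (std + eps)

# One prompt, five rollouts (matching n_agent=5) with exact-match rewards.
print(grpo_advantages(torch.tensor([[1.0, 0.0, 0.0, 1.0, 0.0]])))
```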
diff --git a/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.2/train_ppo.sh b/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.2/train_ppo.sh
new file mode 100644
index 0000000000000000000000000000000000000000..577f17d13c2aa1d486d6af3a605e0030ca4a4387
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.2/train_ppo.sh
@@ -0,0 +1,88 @@
+data_name=nq_hotpotqa_train
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
+
+WAND_PROJECT="Search-R1"
+
+# export BASE_MODEL='Qwen/Qwen2.5-3B'
+# export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-3b-em
+# export BASE_MODEL='Qwen/Qwen2.5-3B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-3b-it-em
+export BASE_MODEL='Qwen/Qwen2.5-7B'
+export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-7b-em
+# export BASE_MODEL='Qwen/Qwen2.5-7B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-7b-it-em
+# export BASE_MODEL='Qwen/Qwen2.5-14B'
+# export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-14b-em
+# export BASE_MODEL='Qwen/Qwen2.5-14B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-14b-it-em
+
+# set -x
+export VLLM_ATTENTION_BACKEND=XFORMERS # vLLM with Qwen2-7B has known issues under flash_attn, so force the xformers backend
+
+# max_prompt_length = (config['training']['max_start_length'] + config['training']['max_response_length'] * (config['training']['max_turns'] - 1) + config['training']['max_obs_length'] * config['training']['max_turns'])
+
+PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
+ data.train_files=$DATA_DIR/train.parquet \
+ data.val_files=$DATA_DIR/test.parquet \
+ data.train_data_num=null \
+ data.val_data_num=null \
+ data.train_batch_size=512 \
+ data.val_batch_size=256 \
+ data.max_prompt_length=4096 \
+ data.max_response_length=500 \
+ data.max_start_length=2048 \
+ data.max_obs_length=500 \
+ data.shuffle_train_dataloader=True \
+ algorithm.adv_estimator=gae \
+ actor_rollout_ref.model.path=$BASE_MODEL \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=true \
+ actor_rollout_ref.actor.fsdp_config.grad_offload=true \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=true \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ actor_rollout_ref.rollout.n_agent=1 \
+ actor_rollout_ref.rollout.temperature=1 \
+ actor_rollout_ref.rollout.top_p=1.0 \
+ actor_rollout_ref.actor.state_masking=true \
+ critic.optim.lr=1e-5 \
+ critic.model.use_remove_padding=True \
+ critic.optim.lr_warmup_steps_ratio=0.015 \
+ critic.model.path=$BASE_MODEL \
+ critic.model.enable_gradient_checkpointing=true \
+ critic.ppo_micro_batch_size=8 \
+ critic.model.fsdp_config.param_offload=true \
+ critic.model.fsdp_config.grad_offload=true \
+ critic.model.fsdp_config.optimizer_offload=true \
+ algorithm.kl_ctrl.kl_coef=0.001 \
+ algorithm.no_think_rl=false \
+ trainer.critic_warmup=0 \
+ trainer.logger=['wandb'] \
+ +trainer.val_only=false \
+ +trainer.val_before_train=true \
+ trainer.default_hdfs_dir=null \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=100 \
+ trainer.test_freq=100 \
+ trainer.project_name=$WAND_PROJECT \
+ trainer.experiment_name=$EXPERIMENT_NAME \
+ trainer.total_epochs=15 \
+ trainer.total_training_steps=1005 \
+ trainer.default_local_dir=verl_checkpoints/$EXPERIMENT_NAME \
+ max_turns=4 \
+ retriever.url="http://127.0.0.1:8000/retrieve" \
+ retriever.topk=3 \
+ 2>&1 | tee $EXPERIMENT_NAME.log
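
This PPO variant keeps `algorithm.adv_estimator=gae` and therefore needs the critic configured above. A single-trajectory sketch of the GAE recursion it selects (`gamma` and `lam` values here are illustrative defaults, not read from the script):

```python
from typing import List

def gae(rewards: List[float], values: List[float],
        gamma: float = 1.0, lam: float = 0.95) -> List[float]:
    """Generalized Advantage Estimation over one trajectory.
    values carries one extra bootstrap entry V(s_T) at the end."""
    advantages = [0.0] * len(rewards)
    last = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        last = delta + gamma * lam * last
        advantages[t] = last
    return advantages

# Toy trajectory with a terminal exact-match reward of 1.
print(gae([0.0, 0.0, 1.0], [0.2, 0.4, 0.7, 0.0]))
```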
diff --git a/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.3/train_grpo_format.sh b/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.3/train_grpo_format.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ec766ca0362ade6d2db5222f82b319be3c111c8b
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.3/train_grpo_format.sh
@@ -0,0 +1,87 @@
+data_name=nq_hotpotqa_train
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
+
+WAND_PROJECT="Search-R1"
+
+export BASE_MODEL='Qwen/Qwen2.5-3B'
+export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-3b-em-structureformat
+# export BASE_MODEL='Qwen/Qwen2.5-3B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-3b-it-em-structureformat
+# export BASE_MODEL='Qwen/Qwen2.5-7B'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-7b-em-structureformat
+# export BASE_MODEL='Qwen/Qwen2.5-7B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-7b-it-em-structureformat
+# export BASE_MODEL='Qwen/Qwen2.5-14B'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-14b-em-structureformat
+# export BASE_MODEL='Qwen/Qwen2.5-14B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-14b-it-em-structureformat
+
+# export BASE_MODEL='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-deepseekr1-7b-em-structureformat
+# export BASE_MODEL='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-deepseekr1-14b-em-structureformat
+
+# set -x
+export VLLM_ATTENTION_BACKEND=XFORMERS # vLLM with Qwen2-7B has known issues under flash_attn, so force the xformers backend
+
+# max_prompt_length = (config['training']['max_start_length'] + config['training']['max_response_length'] * (config['training']['max_turns'] - 1) + config['training']['max_obs_length'] * config['training']['max_turns'])
+
+PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo_format \
+ data.train_files=$DATA_DIR/train.parquet \
+ data.val_files=$DATA_DIR/test.parquet \
+ data.train_data_num=null \
+ data.val_data_num=null \
+ data.train_batch_size=512 \
+ data.val_batch_size=256 \
+ data.max_prompt_length=4096 \
+ data.max_response_length=500 \
+ data.max_start_length=2048 \
+ data.max_obs_length=500 \
+ data.shuffle_train_dataloader=True \
+ algorithm.adv_estimator=grpo \
+ actor_rollout_ref.model.path=$BASE_MODEL \
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.optim.lr=5e-7 \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
+ actor_rollout_ref.actor.use_kl_loss=true \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=true \
+ actor_rollout_ref.actor.fsdp_config.grad_offload=true \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=true \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+ algorithm.no_think_rl=false \
+ actor_rollout_ref.rollout.n_agent=5 \
+ actor_rollout_ref.rollout.temperature=1 \
+ actor_rollout_ref.actor.state_masking=true \
+ trainer.logger=['wandb'] \
+ +trainer.val_only=false \
+ +trainer.val_before_train=true \
+ trainer.default_hdfs_dir=null \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=100 \
+ trainer.test_freq=100 \
+ trainer.project_name=$WAND_PROJECT \
+ trainer.experiment_name=$EXPERIMENT_NAME \
+ trainer.total_epochs=15 \
+ trainer.total_training_steps=1005 \
+ trainer.default_local_dir=/home/peterjin/verl_checkpoints/$EXPERIMENT_NAME \
+ reward_model.structure_format_score=0.2 \
+ reward_model.final_format_score=0.1 \
+ reward_model.retrieval_score=0 \
+ max_turns=4 \
+ retriever.url="http://127.0.0.1:8000/retrieve" \
+ retriever.topk=3 \
+ 2>&1 | tee /home/peterjin/rl_logs/$EXPERIMENT_NAME.log
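
The v0.3 scripts (this one and the PPO variant that follows) call `verl.trainer.main_ppo_format` and pass `reward_model.structure_format_score` and `reward_model.final_format_score`. The actual scoring lives inside the trainer; the sketch below is only a guess at how such a tiered reward could combine exact match with partial format credit, and the tier semantics are an assumption:

```python
import re

def shaped_reward(response: str, gold: str,
                  structure_format_score: float = 0.2,
                  final_format_score: float = 0.1) -> float:
    """Hypothetical tiered reward: full credit for an exact-match answer,
    partial credit for merely producing the expected tag structure."""
    answer = re.search(r'<answer>(.*?)</answer>', response, re.DOTALL)
    if answer and answer.group(1).strip().lower() == gold.strip().lower():
        return 1.0                        # correct answer
    if answer is not None:
        return final_format_score         # well-formed final tag, wrong answer
    if re.search(r'<search>.*?</search>', response, re.DOTALL):
        return structure_format_score     # only followed the tool-call format
    return 0.0

print(shaped_reward("<think>...</think><answer>Paris</answer>", "paris"))  # 1.0
```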
diff --git a/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.3/train_ppo_format.sh b/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.3/train_ppo_format.sh
new file mode 100644
index 0000000000000000000000000000000000000000..15ac4df5706d44a7f220c28dc5c6d16c7d5cc715
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.3/train_ppo_format.sh
@@ -0,0 +1,94 @@
+data_name=nq_hotpotqa_train
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
+
+WAND_PROJECT="Search-R1"
+
+export BASE_MODEL='Qwen/Qwen2.5-3B'
+export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-3b-em-structureformat
+# export BASE_MODEL='Qwen/Qwen2.5-3B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-3b-it-em-structureformat
+# export BASE_MODEL='Qwen/Qwen2.5-7B'
+# export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-7b-em-structureformat
+# export BASE_MODEL='Qwen/Qwen2.5-7B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-7b-it-em-structureformat
+# export BASE_MODEL='Qwen/Qwen2.5-14B'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-14b-em-structureformat
+# export BASE_MODEL='Qwen/Qwen2.5-14B-Instruct'
+# export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-14b-it-em-structureformat
+
+# export BASE_MODEL='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B'
+# export EXPERIMENT_NAME=${data_name}-search-r1-ppo-deepseekr1-14b-em-structureformat
+
+# set -x
+export VLLM_ATTENTION_BACKEND=XFORMERS # vLLM with Qwen2-7B has known issues under flash_attn, so force the xformers backend
+
+# max_prompt_length = (config['training']['max_start_length'] + config['training']['max_response_length'] * (config['training']['max_turns'] - 1) + config['training']['max_obs_length'] * config['training']['max_turns'])
+
+PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo_format \
+ data.train_files=$DATA_DIR/train.parquet \
+ data.val_files=$DATA_DIR/test.parquet \
+ data.train_data_num=null \
+ data.val_data_num=null \
+ data.train_batch_size=512 \
+ data.val_batch_size=256 \
+ data.max_prompt_length=4096 \
+ data.max_response_length=500 \
+ data.max_start_length=2048 \
+ data.max_obs_length=500 \
+ data.shuffle_train_dataloader=True \
+ algorithm.adv_estimator=gae \
+ actor_rollout_ref.model.path=$BASE_MODEL \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=true \
+ actor_rollout_ref.actor.fsdp_config.grad_offload=true \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=true \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ actor_rollout_ref.rollout.n_agent=1 \
+ actor_rollout_ref.rollout.temperature=1 \
+ actor_rollout_ref.rollout.top_p=1.0 \
+ actor_rollout_ref.actor.state_masking=true \
+ critic.optim.lr=1e-5 \
+ critic.model.use_remove_padding=True \
+ critic.optim.lr_warmup_steps_ratio=0.015 \
+ critic.model.path=$BASE_MODEL \
+ critic.model.enable_gradient_checkpointing=true \
+ critic.ppo_micro_batch_size=8 \
+ critic.model.fsdp_config.param_offload=true \
+ critic.model.fsdp_config.grad_offload=true \
+ critic.model.fsdp_config.optimizer_offload=true \
+ algorithm.kl_ctrl.kl_coef=0.001 \
+ algorithm.no_think_rl=false \
+ trainer.critic_warmup=0 \
+ trainer.logger=['wandb'] \
+ +trainer.val_only=false \
+ +trainer.val_before_train=true \
+ trainer.default_hdfs_dir=null \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=100 \
+ trainer.test_freq=100 \
+ trainer.project_name=$WAND_PROJECT \
+ trainer.experiment_name=$EXPERIMENT_NAME \
+ trainer.total_epochs=15 \
+ trainer.total_training_steps=1005 \
+ trainer.default_local_dir=/home/peterjin/verl_checkpoints/$EXPERIMENT_NAME \
+ reward_model.structure_format_score=0.2 \
+ reward_model.final_format_score=0.1 \
+ reward_model.retrieval_score=0 \
+ max_turns=4 \
+ retriever.url="http://127.0.0.1:8000/retrieve" \
+ retriever.topk=3 \
+ 2>&1 | tee /home/peterjin/rl_logs/$EXPERIMENT_NAME.log
diff --git a/code/RL_model/verl/Search-R1/misc/scripts/upload.py b/code/RL_model/verl/Search-R1/misc/scripts/upload.py
new file mode 100644
index 0000000000000000000000000000000000000000..236339730a881d7dcf7151b975ad4f3550239811
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/scripts/upload.py
@@ -0,0 +1,12 @@
+import os
+from huggingface_hub import upload_file
+
+repo_id = "PeterJinGo/wiki-18-e5-index"
+path = "/home/peterjin/mnt/index/wiki-18"
+for file in ["part_aa", "part_ab"]:
+ upload_file(
+ path_or_fileobj=os.path.join(path, file), # File path
+ path_in_repo=file, # Destination filename in the repo
+ repo_id=repo_id, # Your dataset repo ID
+ repo_type="dataset"
+ )
diff --git a/code/RL_model/verl/Search-R1/misc/scripts/upload.sh b/code/RL_model/verl/Search-R1/misc/scripts/upload.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0c3a21c79004acfed33e37c1662e411e634d0399
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/misc/scripts/upload.sh
@@ -0,0 +1,6 @@
+
+index=/home/peterjin/mnt/index/wiki-18/e5_Flat.index
+
+split -b 40G $index part_
+
+python upload.py
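
`upload.sh` splits the Faiss index into 40 GB chunks before `upload.py` pushes `part_aa` and `part_ab` to the dataset repo, so a consumer has to concatenate the parts in order before Faiss can read the file (the shell equivalent is `cat part_aa part_ab > e5_Flat.index`). A minimal sketch, with local paths illustrative:

```python
import shutil

# Reassemble the split Faiss index after downloading both parts.
parts = ["part_aa", "part_ab"]
with open("e5_Flat.index", "wb") as out:
    for part in parts:
        with open(part, "rb") as f:
            shutil.copyfileobj(f, out)
```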
diff --git a/code/RL_model/verl/Search-R1/pyproject.toml b/code/RL_model/verl/Search-R1/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..3d361848f54f7fc2da0b6cfafedfadc42e91de7b
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/pyproject.toml
@@ -0,0 +1,78 @@
+# -------------------------------
+# build-system
+# -------------------------------
+[build-system]
+requires = [
+ "setuptools>=61.0",
+ "wheel"
+]
+build-backend = "setuptools.build_meta"
+
+# -------------------------------
+# project (PEP 621 metadata)
+# -------------------------------
+[project]
+name = "verl"
+# We'll mark the version as "dynamic" because it's read from the file "verl/version/version"
+# (PEP 621 calls this "dynamic version").
+# The actual version is specified in the [tool.setuptools.dynamic] section below.
+dynamic = ["version"]
+
+description = "veRL: Volcano Engine Reinforcement Learning for LLM"
+license = {file = "LICENSE"} # or "Apache-2.0", if you prefer an SPDX identifier
+readme = {file = "README.md", content-type = "text/markdown"}
+requires-python = ">=3.8"
+
+authors = [
+ { name = "Bytedance - Seed - MLSys", email = "zhangchi.usc1992@bytedance.com" },
+ { name = "Bytedance - Seed - MLSys", email = "gmsheng@connect.hku.hk" },
+]
+
+# Dependencies corresponding to install_requires in setup.py
+dependencies = [
+ "accelerate",
+ "codetiming",
+ "datasets",
+ "dill",
+ "hydra-core",
+ "numpy",
+ "pybind11",
+ "ray",
+ "tensordict",
+ "transformers<4.48",
+ "vllm<=0.6.3",
+]
+
+# Optional dependencies (extras_require in setup.py)
+[project.optional-dependencies]
+test = [
+ "pytest", "yapf"
+]
+
+# URLs
+[project.urls]
+Homepage = "https://github.com/volcengine/verl"
+
+# -------------------------------
+# tool.setuptools - Additional config
+# -------------------------------
+[tool.setuptools]
+# True means `setuptools` will attempt to include all relevant files in package_data automatically.
+# This corresponds to `include_package_data=True` in setup.py.
+include-package-data = true
+
+# We read the version from a file in 'verl/version/version'
+[tool.setuptools.dynamic]
+version = {file = "verl/version/version"}
+
+# If you need to mimic `package_dir={'': '.'}`:
+[tool.setuptools.package-dir]
+"" = "."
+
+# If you need to include specific non-Python data (like YAML files or version file):
+# This is the rough equivalent of package_data={'': ['version/*'], 'verl': ['trainer/config/*.yaml']}
+[tool.setuptools.package-data]
+verl = [
+ "version/*",
+ "trainer/config/*.yaml"
+]
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/requirements.txt b/code/RL_model/verl/Search-R1/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5381179bae61fa7ef65e98b483544e57b0f671bb
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/requirements.txt
@@ -0,0 +1,16 @@
+accelerate
+codetiming
+datasets
+dill
+flash-attn
+hydra-core
+numpy
+pandas
+pybind11
+ray
+tensordict<0.6
+transformers<4.48
+vllm<=0.6.3
+wandb
+IPython
+matplotlib
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/retrieval_launch.sh b/code/RL_model/verl/Search-R1/retrieval_launch.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c561b1fc0eaf69472ece7eb96afd42c0186ff284
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/retrieval_launch.sh
@@ -0,0 +1,13 @@
+
+file_path=/the/path/you/save/corpus
+index_file=$file_path/e5_Flat.index
+corpus_file=$file_path/wiki-18.jsonl
+retriever_name=e5
+retriever_path=intfloat/e5-base-v2
+
+python search_r1/search/retrieval_server.py --index_path $index_file \
+ --corpus_path $corpus_file \
+ --topk 3 \
+ --retriever_name $retriever_name \
+ --retriever_model $retriever_path \
+ --faiss_gpu
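
Once this server is up, the training scripts reach it through the `retriever.url` override shown earlier. The payload below mirrors what `LLMGenerationManager._batch_search` sends (defined later in this diff); a quick smoke test against a locally running server:

```python
import requests

payload = {
    "queries": ["who wrote the declaration of independence"],
    "topk": 3,
    "return_scores": True,
}
resp = requests.post("http://127.0.0.1:8000/retrieve", json=payload)
resp.raise_for_status()

# One list of passages per query; each entry carries the
# {"document": {"contents": ...}} shape that _passages2string expects.
passages = resp.json()["result"][0]
print(passages[0]["document"]["contents"][:200])
```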
diff --git a/code/RL_model/verl/Search-R1/search_r1/__init__.py b/code/RL_model/verl/Search-R1/search_r1/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/RL_model/verl/Search-R1/search_r1/llm_agent/__init__.py b/code/RL_model/verl/Search-R1/search_r1/llm_agent/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/code/RL_model/verl/Search-R1/search_r1/llm_agent/generation.py b/code/RL_model/verl/Search-R1/search_r1/llm_agent/generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b68cb003ac3f943d45eb8d5cf48a7ebee5cd1f6
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/llm_agent/generation.py
@@ -0,0 +1,469 @@
+import torch
+import re
+from collections import defaultdict
+import os
+from typing import List, Dict, Any, Tuple
+from dataclasses import dataclass
+from .tensor_helper import TensorHelper, TensorConfig
+from verl import DataProto
+from verl.utils.tracking import Tracking
+import shutil
+import requests
+
+@dataclass
+class GenerationConfig:
+ max_turns: int
+ max_start_length: int
+ max_prompt_length: int
+ max_response_length: int
+ max_obs_length: int
+ num_gpus: int
+ no_think_rl: bool=False
+ search_url: str = None
+ topk: int = 3
+
+class LLMGenerationManager:
+ def __init__(
+ self,
+ tokenizer,
+ actor_rollout_wg,
+ config: GenerationConfig,
+ is_validation: bool = False,
+ ):
+ self.tokenizer = tokenizer
+ self.actor_rollout_wg = actor_rollout_wg
+ self.config = config
+ self.is_validation = is_validation
+
+ self.tensor_fn = TensorHelper(TensorConfig(
+ pad_token_id=tokenizer.pad_token_id,
+ max_prompt_length=config.max_prompt_length,
+ max_obs_length=config.max_obs_length,
+ max_start_length=config.max_start_length
+ ))
+
+ def _batch_tokenize(self, responses: List[str]) -> torch.Tensor:
+ """Tokenize a batch of responses."""
+ return self.tokenizer(
+ responses,
+ add_special_tokens=False,
+ return_tensors='pt',
+ padding="longest"
+ )['input_ids']
+
+    def _postprocess_responses(self, responses: torch.Tensor) -> Tuple[torch.Tensor, List[str]]:
+        """Process responses to stop at the first search or answer operation."""
+        responses_str = self.tokenizer.batch_decode(
+            responses,
+            skip_special_tokens=True
+        )
+
+        # Truncate each rollout at the first closing </search> or </answer> tag.
+        responses_str = [resp.split('</search>')[0] + '</search>'
+                 if '</search>' in resp
+                 else resp.split('</answer>')[0] + '</answer>'
+                 if '</answer>' in resp
+                 else resp
+                 for resp in responses_str]
+
+        if self.config.no_think_rl:
+            raise ValueError('stop')
+            # if no_think_rl is enabled, only keep the action in the str
+            # (unreachable code below: self.env / envs are not defined on this class)
+            actions, _ = self.env.postprocess_predictions(responses_str)
+            responses_str = [f"<answer>{envs[idx].ACTION_LOOKUP[action]}</answer>" for idx, action in enumerate(actions)]
+            print("RESPONSES:", responses_str)
+        responses = self._batch_tokenize(responses_str)
+        return responses, responses_str
+
+ def _process_next_obs(self, next_obs: List[str]) -> torch.Tensor:
+ """Process next observations from environment."""
+
+ next_obs_ids = self.tokenizer(
+ next_obs,
+ padding='longest',
+ return_tensors='pt',
+ add_special_tokens=False, # Prevents adding special tokens
+ )['input_ids']
+
+ if next_obs_ids.shape[1] > self.config.max_obs_length:
+ print(f"[WARNING] OBSERVATION TOO LONG, CONSIDER CHANGING YOUR CONFIG, {next_obs_ids.shape[1]} & {self.config.max_obs_length}")
+ next_obs_ids = next_obs_ids[:, :self.config.max_obs_length]
+
+ return next_obs_ids
+
+    def _update_rolling_state(self, rollings: DataProto, cur_responses: torch.Tensor,
+                            next_obs_ids: torch.Tensor) -> DataProto:
+ """Update rolling state with new responses and observations."""
+ # Concatenate and handle padding
+ new_input_ids = self.tensor_fn.concatenate_with_padding([
+ rollings.batch['input_ids'],
+ cur_responses,
+ next_obs_ids
+ ])
+
+ # Create attention mask and position ids
+ new_attention_mask = self.tensor_fn.create_attention_mask(new_input_ids)
+ new_position_ids = self.tensor_fn.create_position_ids(new_attention_mask)
+
+ # Cut to appropriate length
+ effective_len = new_attention_mask.sum(dim=1).max()
+ max_len = min(self.config.max_prompt_length, effective_len)
+
+ new_rollings = DataProto.from_dict({
+ 'input_ids': new_input_ids[:, -max_len:],
+ 'position_ids': new_position_ids[:, -max_len:],
+ 'attention_mask': new_attention_mask[:, -max_len:]
+ })
+ new_rollings.meta_info.update(rollings.meta_info)
+
+ return new_rollings
+
+ def _info_masked_concatenate_with_padding(self,
+ prompt: torch.Tensor,
+ prompt_with_mask: torch.Tensor,
+ response: torch.Tensor,
+ info: torch.Tensor = None,
+ pad_to_left: bool = True
+                                        ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Concatenate tensors and handle padding. Additionally, create a mask (info_mask) to cover the information block if it exists."""
+ pad_id = self.tokenizer.pad_token_id
+ tensors = [prompt, response]
+ tensors_with_mask = [prompt_with_mask, response]
+ if info is not None:
+ tensors.append(info)
+ info_mask = torch.full(info.size(), pad_id, dtype=info.dtype, device=info.device) # information mask
+ tensors_with_mask.append(info_mask)
+
+ concatenated = torch.cat(tensors, dim=1)
+ concatenated_with_info = torch.cat(tensors_with_mask, dim=1)
+ mask = concatenated != pad_id if pad_to_left else concatenated == pad_id
+ sorted_indices = mask.to(torch.int64).argsort(dim=1, stable=True)
+ padded_tensor = concatenated.gather(1, sorted_indices)
+ padded_tensor_with_info = concatenated_with_info.gather(1, sorted_indices)
+
+ return padded_tensor, padded_tensor_with_info
+
+ def _update_right_side(self, right_side: Dict,
+ cur_responses: torch.Tensor,
+ next_obs_ids: torch.Tensor = None) -> Dict:
+ """Update right side state."""
+        if next_obs_ids is not None:
+ responses, responses_with_info_mask = self._info_masked_concatenate_with_padding(
+ right_side['responses'],
+ right_side['responses_with_info_mask'],
+ cur_responses,
+ next_obs_ids,
+ pad_to_left=False
+ )
+ else:
+ responses, responses_with_info_mask = self._info_masked_concatenate_with_padding(
+ right_side['responses'],
+ right_side['responses_with_info_mask'],
+ cur_responses,
+ pad_to_left=False
+ )
+ effective_len = self.tensor_fn.create_attention_mask(responses).sum(dim=1).max()
+ max_len = min(self.config.max_prompt_length, effective_len)
+
+ return {'responses': responses[:, :max_len], 'responses_with_info_mask': responses_with_info_mask[:, :max_len]}
+
+ def _generate_with_gpu_padding(self, active_batch: DataProto) -> DataProto:
+ """
+ Wrapper for generation that handles multi-GPU padding requirements.
+ if num_gpus <= 1, return self.actor_rollout_wg.generate_sequences(active_batch)
+ if active_batch size is not divisible by num_gpus, pad with first sequence
+ then remove padding from output
+ """
+ num_gpus = self.config.num_gpus
+ if num_gpus <= 1:
+ return self.actor_rollout_wg.generate_sequences(active_batch)
+
+ batch_size = active_batch.batch['input_ids'].shape[0]
+ remainder = batch_size % num_gpus
+
+ for key in active_batch.batch.keys():
+ active_batch.batch[key] = active_batch.batch[key].long()
+ if remainder == 0:
+ return self.actor_rollout_wg.generate_sequences(active_batch)
+
+ # Add padding sequences
+ padding_size = num_gpus - remainder
+ padded_batch = {}
+
+ for k, v in active_batch.batch.items():
+ # Use first sequence as padding template
+ pad_sequence = v[0:1].repeat(padding_size, *[1] * (len(v.shape) - 1))
+ padded_batch[k] = torch.cat([v, pad_sequence], dim=0)
+
+ padded_active_batch = DataProto.from_dict(padded_batch)
+ for key in padded_active_batch.batch.keys():
+ padded_active_batch.batch[key] = padded_active_batch.batch[key].long()
+
+ # Generate with padded batch
+ padded_output = self.actor_rollout_wg.generate_sequences(padded_active_batch)
+
+ # Remove padding from output
+ trimmed_batch = {k: v[:-padding_size] for k, v in padded_output.batch.items()}
+
+ # Handle meta_info if present
+ if hasattr(padded_output, 'meta_info') and padded_output.meta_info:
+ trimmed_meta = {}
+ for k, v in padded_output.meta_info.items():
+ if isinstance(v, torch.Tensor):
+ trimmed_meta[k] = v[:-padding_size]
+ else:
+ trimmed_meta[k] = v
+ padded_output.meta_info = trimmed_meta
+
+ padded_output.batch = trimmed_batch
+ return padded_output
+
+    def run_llm_loop(self, gen_batch, initial_input_ids: torch.Tensor) -> DataProto:
+ """Run main LLM generation loop."""
+
+ original_left_side = {'input_ids': initial_input_ids[:, -self.config.max_start_length:]}
+ original_right_side = {'responses': initial_input_ids[:, []], 'responses_with_info_mask': initial_input_ids[:, []]}
+
+ active_mask = torch.ones(gen_batch.batch['input_ids'].shape[0], dtype=torch.bool)
+ turns_stats = torch.ones(gen_batch.batch['input_ids'].shape[0], dtype=torch.int)
+ valid_action_stats = torch.zeros(gen_batch.batch['input_ids'].shape[0], dtype=torch.int)
+ valid_search_stats = torch.zeros(gen_batch.batch['input_ids'].shape[0], dtype=torch.int)
+ active_num_list = [active_mask.sum().item()]
+ rollings = gen_batch
+
+ # Main generation loop
+ for step in range(self.config.max_turns):
+ if not active_mask.sum():
+ break
+ rollings.batch = self.tensor_fn.cut_to_effective_len(
+ rollings.batch,
+ keys=['input_ids', 'attention_mask', 'position_ids']
+ )
+
+ # gen_output = self.actor_rollout_wg.generate_sequences(rollings)
+ rollings_active = DataProto.from_dict({
+ k: v[active_mask] for k, v in rollings.batch.items()
+ })
+ gen_output = self._generate_with_gpu_padding(rollings_active)
+
+ meta_info = gen_output.meta_info
+ responses_ids, responses_str = self._postprocess_responses(gen_output.batch['responses'])
+ responses_ids, responses_str = self.tensor_fn._example_level_pad(responses_ids, responses_str, active_mask)
+
+ # Execute in environment and process observations
+ next_obs, dones, valid_action, is_search = self.execute_predictions(
+ responses_str, self.tokenizer.pad_token, active_mask
+ )
+
+ curr_active_mask = torch.tensor([not done for done in dones], dtype=torch.bool)
+ active_mask = active_mask * curr_active_mask
+ active_num_list.append(active_mask.sum().item())
+ turns_stats[curr_active_mask] += 1
+ valid_action_stats += torch.tensor(valid_action, dtype=torch.int)
+ valid_search_stats += torch.tensor(is_search, dtype=torch.int)
+
+ next_obs_ids = self._process_next_obs(next_obs)
+
+ # Update states
+ rollings = self._update_rolling_state(
+ rollings,
+ responses_ids,
+ next_obs_ids
+ )
+ original_right_side = self._update_right_side(
+ original_right_side,
+ responses_ids,
+ next_obs_ids
+ )
+
+ # final LLM rollout
+ if active_mask.sum():
+ rollings.batch = self.tensor_fn.cut_to_effective_len(
+ rollings.batch,
+ keys=['input_ids', 'attention_mask', 'position_ids']
+ )
+
+ # gen_output = self.actor_rollout_wg.generate_sequences(rollings)
+ rollings_active = DataProto.from_dict({
+ k: v[active_mask] for k, v in rollings.batch.items()
+ })
+ gen_output = self._generate_with_gpu_padding(rollings_active)
+
+ meta_info = gen_output.meta_info
+ responses_ids, responses_str = self._postprocess_responses(gen_output.batch['responses'])
+ responses_ids, responses_str = self.tensor_fn._example_level_pad(responses_ids, responses_str, active_mask)
+
+            # Execute in environment and process observations (do_search=False on the final turn)
+ _, dones, valid_action, is_search = self.execute_predictions(
+ responses_str, self.tokenizer.pad_token, active_mask, do_search=False
+ )
+
+ curr_active_mask = torch.tensor([not done for done in dones], dtype=torch.bool)
+ active_mask = active_mask * curr_active_mask
+ active_num_list.append(active_mask.sum().item())
+ valid_action_stats += torch.tensor(valid_action, dtype=torch.int)
+ valid_search_stats += torch.tensor(is_search, dtype=torch.int)
+
+ original_right_side = self._update_right_side(
+ original_right_side,
+ responses_ids,
+ )
+
+ meta_info['turns_stats'] = turns_stats.tolist()
+ meta_info['active_mask'] = active_mask.tolist()
+ meta_info['valid_action_stats'] = valid_action_stats.tolist()
+ meta_info['valid_search_stats'] = valid_search_stats.tolist()
+
+ print("ACTIVE_TRAJ_NUM:", active_num_list)
+
+ return self._compose_final_output(original_left_side, original_right_side, meta_info)
+
+ def _compose_final_output(self, left_side: Dict,
+ right_side: Dict,
+                            meta_info: Dict) -> DataProto:
+ """Compose final generation output."""
+ final_output = right_side.copy()
+ final_output['prompts'] = left_side['input_ids']
+
+ # Combine input IDs
+ final_output['input_ids'] = torch.cat([
+ left_side['input_ids'],
+ right_side['responses']
+ ], dim=1)
+
+ # Create attention mask and position ids
+ final_output['attention_mask'] = torch.cat([
+ self.tensor_fn.create_attention_mask(left_side['input_ids']),
+ self.tensor_fn.create_attention_mask(final_output['responses'])
+ ], dim=1)
+ final_output['info_mask'] = torch.cat([
+ self.tensor_fn.create_attention_mask(left_side['input_ids']),
+ self.tensor_fn.create_attention_mask(final_output['responses_with_info_mask'])
+ ], dim=1)
+
+ final_output['position_ids'] = self.tensor_fn.create_position_ids(
+ final_output['attention_mask']
+ )
+
+ final_output = DataProto.from_dict(final_output)
+ final_output.meta_info.update(meta_info)
+
+ return final_output
+
+    def execute_predictions(self, predictions: List[str], pad_token: str, active_mask=None, do_search=True) -> Tuple[List[str], List[int], List[int], List[int]]:
+        """
+        Execute predictions across the batch of trajectories.
+        NOTE: this function is the de facto `step` function of the environment.
+        NOTE: penalty_for_invalid is not included in the observation shown to the LLM.
+
+        Args:
+            predictions: List of raw action predictions from the LLM
+            pad_token: Token to use for padding
+            active_mask: Boolean mask over trajectories that are still active
+            do_search: Whether to actually query the retrieval server
+
+        Returns:
+            Tuple of (next observations, done flags, valid-action flags, search flags)
+        """
+ cur_actions, contents = self.postprocess_predictions(predictions)
+ next_obs, dones, valid_action, is_search = [], [], [], []
+
+ search_queries = [content for action, content in zip(cur_actions, contents) if action == 'search']
+ if do_search:
+ search_results = self.batch_search(search_queries)
+ assert len(search_results) == sum([1 for action in cur_actions if action == 'search'])
+ else:
+ search_results = [''] * sum([1 for action in cur_actions if action == 'search'])
+
+ for i, (action, active) in enumerate(zip(cur_actions, active_mask)):
+
+ if not active:
+ next_obs.append('')
+ dones.append(1)
+ valid_action.append(0)
+ is_search.append(0)
+ else:
+ if action == 'answer':
+ next_obs.append('')
+ dones.append(1)
+ valid_action.append(1)
+ is_search.append(0)
+ elif action == 'search':
+                    next_obs.append(f'\n\n<information>{search_results.pop(0).strip()}</information>\n\n')
+ dones.append(0)
+ valid_action.append(1)
+ is_search.append(1)
+ else:
+                    next_obs.append(f'\nMy previous action is invalid. \
+If I want to search, I should put the query between <search> and </search>. \
+If I want to give the final answer, I should put the answer between <answer> and </answer>. Let me try again.\n')
+ dones.append(0)
+ valid_action.append(0)
+ is_search.append(0)
+
+ assert len(search_results) == 0
+
+ return next_obs, dones, valid_action, is_search
+
+    def postprocess_predictions(self, predictions: List[Any]) -> Tuple[List[str], List[str]]:
+        """
+        Parse (text-based) LLM predictions into actions and their contents.
+
+        Args:
+            predictions: List of raw predictions
+
+        Returns:
+            Tuple of (actions list, contents list); the action is 'search',
+            'answer', or None when no valid tag pair is found.
+        """
+ actions = []
+ contents = []
+
+ for prediction in predictions:
+ if isinstance(prediction, str): # for llm output
+                pattern = r'<(search|answer)>(.*?)</\1>'
+ match = re.search(pattern, prediction, re.DOTALL)
+ if match:
+ content = match.group(2).strip() # Return only the content inside the tags
+ action = match.group(1)
+ else:
+ content = ''
+ action = None
+ else:
+ raise ValueError(f"Invalid prediction type: {type(prediction)}")
+
+ actions.append(action)
+ contents.append(content)
+
+ return actions, contents
+
+    def batch_search(self, queries: List[str] = None) -> List[str]:
+        """
+        Batchified search for queries.
+        Args:
+            queries: queries to send to the search engine
+        Returns:
+            List of search results, one formatted passage string per query
+        """
+ results = self._batch_search(queries)['result']
+
+ return [self._passages2string(result) for result in results]
+
+ def _batch_search(self, queries):
+
+ payload = {
+ "queries": queries,
+ "topk": self.config.topk,
+ "return_scores": True
+ }
+
+ return requests.post(self.config.search_url, json=payload).json()
+
+ def _passages2string(self, retrieval_result):
+ format_reference = ''
+ for idx, doc_item in enumerate(retrieval_result):
+
+ content = doc_item['document']['contents']
+ title = content.split("\n")[0]
+ text = "\n".join(content.split("\n")[1:])
+ format_reference += f"Doc {idx+1}(Title: {title}) {text}\n"
+
+ return format_reference
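
The `<search>`/`<answer>` tag protocol parsed by `postprocess_predictions` is the heart of the agent loop. A standalone check of the same regex, duplicated here so it runs without verl:

```python
import re

pattern = r'<(search|answer)>(.*?)</\1>'  # same pattern as postprocess_predictions

for prediction in [
    "<think>need more info</think> <search>capital of France</search>",
    "<answer>Paris</answer>",
    "no tags at all",
]:
    match = re.search(pattern, prediction, re.DOTALL)
    action = match.group(1) if match else None
    content = match.group(2).strip() if match else ''
    print(action, repr(content))
# search 'capital of France'
# answer 'Paris'
# None ''
```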
diff --git a/code/RL_model/verl/Search-R1/search_r1/llm_agent/tensor_helper.py b/code/RL_model/verl/Search-R1/search_r1/llm_agent/tensor_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..15a7c7c084c4f952533f43b214f987db81075255
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/llm_agent/tensor_helper.py
@@ -0,0 +1,75 @@
+import torch
+from typing import Dict, Tuple, List
+from dataclasses import dataclass
+
+@dataclass
+class TensorConfig:
+ pad_token_id: int
+ max_prompt_length: int
+ max_obs_length: int
+ max_start_length: int
+
+class TensorHelper:
+ def __init__(self, config: TensorConfig):
+ self.config = config
+
+ def cut_to_effective_len(self, tensor_dict: Dict[str, torch.Tensor],
+ keys: List[str], cut_left: bool = True) -> Dict[str, torch.Tensor]:
+ """Cut tensors to their effective length based on attention mask."""
+ effective_len = tensor_dict['attention_mask'].sum(dim=1).max()
+ result = tensor_dict.copy()
+
+ for key in keys:
+ if cut_left:
+ result[key] = tensor_dict[key][:, -effective_len:]
+ else:
+ result[key] = tensor_dict[key][:, :effective_len]
+ return result
+
+ def convert_pad_structure(self, tensor: torch.Tensor, pad_to_left: bool = True) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Convert padding structure and return sorted tensor with indices."""
+ mask = tensor != self.config.pad_token_id if pad_to_left else tensor == self.config.pad_token_id
+ sorted_indices = mask.to(torch.int64).argsort(dim=1, stable=True)
+ return tensor.gather(1, sorted_indices), sorted_indices
+
+ def create_attention_mask(self, input_ids: torch.Tensor) -> torch.Tensor:
+ """Create attention mask from input ids."""
+ return torch.where(input_ids != self.config.pad_token_id, 1, 0)
+
+ def create_position_ids(self, attention_mask: torch.Tensor) -> torch.Tensor:
+ """Create position ids from attention mask."""
+ return (torch.cumsum(attention_mask, dim=1) - 1) * attention_mask
+
+ def concatenate_with_padding(self, tensors: List[torch.Tensor],
+ pad_to_left: bool = True) -> torch.Tensor:
+ """Concatenate tensors and handle padding."""
+ concatenated = torch.cat(tensors, dim=1)
+ padded_tensor, _ = self.convert_pad_structure(concatenated, pad_to_left)
+ return padded_tensor
+
+ def _example_level_pad(self, responses: torch.Tensor,
+ responses_str: List[str],
+ active_mask: torch.Tensor) -> Tuple[torch.Tensor, List[str]]:
+ """
+ Pad responses for non-active examples with pad tokens.
+ """
+ assert active_mask.sum() == responses.shape[0]
+ # Create masked responses tensor
+ batch_size = active_mask.shape[0]
+ seq_len = responses.shape[1]
+ padded_responses = torch.full(
+ (batch_size, seq_len), self.config.pad_token_id,
+ dtype=responses.dtype, device=responses.device
+ )
+ padded_responses[active_mask] = responses
+
+ # Create masked response strings
+ padded_responses_str = [""] * batch_size
+
+ s = 0
+ for i, is_active in enumerate(active_mask):
+ if is_active:
+ padded_responses_str[i] = responses_str[s]
+ s += 1
+
+ return padded_responses, padded_responses_str
\ No newline at end of file
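
A small worked example of the padding and position-id conventions above, assuming `pad_token_id=0` (the logic is copied inline so the snippet runs standalone):

```python
import torch

pad = 0
tensor = torch.tensor([[5, 7, pad, pad],
                       [pad, 9, 4, pad]])

# Move padding to the left, as convert_pad_structure does by default.
mask = tensor != pad
sorted_indices = mask.to(torch.int64).argsort(dim=1, stable=True)
left_padded = tensor.gather(1, sorted_indices)
print(left_padded)   # tensor([[0, 0, 5, 7], [0, 0, 9, 4]])

# Position ids count only non-pad tokens, exactly as create_position_ids does.
attention_mask = torch.where(left_padded != pad, 1, 0)
position_ids = (torch.cumsum(attention_mask, dim=1) - 1) * attention_mask
print(position_ids)  # tensor([[0, 0, 0, 1], [0, 0, 0, 1]])
```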
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/build_index.sh b/code/RL_model/verl/Search-R1/search_r1/search/build_index.sh
new file mode 100644
index 0000000000000000000000000000000000000000..05556a3939471d956360bc1f91d7043e19c73a85
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/build_index.sh
@@ -0,0 +1,19 @@
+
+corpus_file=/your/corpus/jsonl/file # jsonl
+save_dir=/the/path/to/save/index
+retriever_name=e5 # this is for indexing naming
+retriever_model=intfloat/e5-base-v2
+
+# change faiss_type to HNSW32/64/128 for ANN indexing
+# change retriever_name to bm25 for BM25 indexing
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python index_builder.py \
+ --retrieval_method $retriever_name \
+ --model_path $retriever_model \
+ --corpus_path $corpus_file \
+ --save_dir $save_dir \
+ --use_fp16 \
+ --max_length 256 \
+ --batch_size 512 \
+ --pooling_method mean \
+ --faiss_type Flat \
+ --save_embedding
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/google_search_server.py b/code/RL_model/verl/Search-R1/search_r1/search/google_search_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad72aeefae69d0796f137557ad8f3bb0d2381be6
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/google_search_server.py
@@ -0,0 +1,202 @@
+import os
+import re
+import requests
+import argparse
+import asyncio
+import random
+from typing import List, Optional, Dict
+from concurrent.futures import ThreadPoolExecutor
+
+import chardet
+import aiohttp
+import bs4
+import uvicorn
+from fastapi import FastAPI
+from pydantic import BaseModel
+from googleapiclient.discovery import build
+
+
+# --- CLI Args ---
+parser = argparse.ArgumentParser(description="Launch online search server.")
+parser.add_argument('--api_key', type=str, required=True, help="API key for Google search")
+parser.add_argument('--cse_id', type=str, required=True, help="CSE ID for Google search")
+parser.add_argument('--topk', type=int, default=3, help="Number of results to return per query")
+parser.add_argument('--snippet_only', action='store_true', help="If set, only return snippets; otherwise, return full context.")
+args = parser.parse_args()
+
+
+# --- Config ---
+class OnlineSearchConfig:
+ def __init__(self, topk: int = 3, api_key: Optional[str] = None, cse_id: Optional[str] = None, snippet_only: bool = False):
+ self.topk = topk
+ self.api_key = api_key
+ self.cse_id = cse_id
+ self.snippet_only = snippet_only
+
+
+# --- Utilities ---
+def parse_snippet(snippet: str) -> List[str]:
+ segments = snippet.split("...")
+ return [s.strip() for s in segments if len(s.strip().split()) > 5]
+
+
+def sanitize_search_query(query: str) -> str:
+ # Remove or replace special characters that might cause issues.
+ # This is a basic example; you might need to add more characters or patterns.
+    sanitized_query = re.sub(r'[^\w\s]', ' ', query)  # Replace non-alphanumeric, non-whitespace chars with spaces.
+    sanitized_query = re.sub(r'[\t\r\f\v\n]', ' ', sanitized_query)  # Replace tab, return, formfeed, and vertical tab with spaces.
+    sanitized_query = re.sub(r'\s+', ' ', sanitized_query).strip()  # Collapse duplicate spaces and trim leading/trailing spaces.
+
+ return sanitized_query
+
+
+def filter_links(search_results: List[Dict]) -> List[str]:
+ links = []
+ for result in search_results:
+ for item in result.get("items", []):
+ if "mime" in item:
+ continue
+ ext = os.path.splitext(item["link"])[1]
+ if ext in ["", ".html", ".htm", ".shtml"]:
+ links.append(item["link"])
+ return links
+
+
+async def fetch(session: aiohttp.ClientSession, url: str, semaphore: asyncio.Semaphore) -> str:
+ user_agents = [
+ "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P)...",
+ "Mozilla/5.0 AppleWebKit/537.36...",
+ "Mozilla/5.0 (compatible; Googlebot/2.1; +https://www.google.com/bot.html)",
+ ]
+ headers = {"User-Agent": random.choice(user_agents)}
+
+ async with semaphore:
+ try:
+ async with session.get(url, headers=headers) as response:
+ raw = await response.read()
+ detected = chardet.detect(raw)
+ encoding = detected["encoding"] or "utf-8"
+ return raw.decode(encoding, errors="ignore")
+ except (aiohttp.ClientError, asyncio.TimeoutError):
+ return ""
+
+
+async def fetch_all(urls: List[str], limit: int = 8) -> List[str]:
+ semaphore = asyncio.Semaphore(limit)
+ timeout = aiohttp.ClientTimeout(total=5)
+ connector = aiohttp.TCPConnector(limit_per_host=limit, force_close=True)
+
+ async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session:
+ tasks = [fetch(session, url, semaphore) for url in urls]
+ return await asyncio.gather(*tasks)
+
+
+# --- Search Engine ---
+class OnlineSearchEngine:
+ def __init__(self, config: OnlineSearchConfig):
+ self.config = config
+
+ def collect_context(self, snippet: str, doc: str) -> str:
+ snippets = parse_snippet(snippet)
+ ctx_paras = []
+
+ for s in snippets:
+ pos = doc.replace("\n", " ").find(s)
+ if pos == -1:
+ continue
+ sta = pos
+ while sta > 0 and doc[sta] != "\n":
+ sta -= 1
+ end = pos + len(s)
+ while end < len(doc) and doc[end] != "\n":
+ end += 1
+ para = doc[sta:end].strip()
+ if para not in ctx_paras:
+ ctx_paras.append(para)
+
+ return "\n".join(ctx_paras)
+
+ def fetch_web_content(self, search_results: List[Dict]) -> Dict[str, str]:
+ links = filter_links(search_results)
+ contents = asyncio.run(fetch_all(links))
+ content_dict = {}
+ for html, link in zip(contents, links):
+ soup = bs4.BeautifulSoup(html, "html.parser")
+ text = "\n".join([p.get_text() for p in soup.find_all("p")])
+ content_dict[link] = text
+ return content_dict
+
+ def search(self, search_term: str, num_iter: int = 1) -> List[Dict]:
+ service = build('customsearch', 'v1', developerKey=self.config.api_key)
+ results = []
+        sanitized_search_term = sanitize_search_query(search_term)
+        if not sanitized_search_term:
+            return results
+        res = service.cse().list(q=sanitized_search_term, cx=self.config.cse_id).execute()
+        results.append(res)
+
+        for _ in range(num_iter - 1):
+            if 'nextPage' not in res.get('queries', {}):
+                break
+            start_idx = res['queries']['nextPage'][0]['startIndex']
+            # Keep using the sanitized query for follow-up result pages as well.
+            res = service.cse().list(q=sanitized_search_term, cx=self.config.cse_id, start=start_idx).execute()
+            results.append(res)
+
+ return results
+
+ def batch_search(self, queries: List[str]) -> List[List[str]]:
+ with ThreadPoolExecutor() as executor:
+ return list(executor.map(self._retrieve_context, queries))
+
+    def _retrieve_context(self, query: str) -> List[str]:
+        # Run the search once up front; both branches below consume its results.
+        search_results = self.search(query)
+
+        if self.config.snippet_only:
+            contexts = []
+ for result in search_results:
+ for item in result.get("items", []):
+ title = item.get("title", "")
+ context = ' '.join(parse_snippet(item.get("snippet", "")))
+ if title != "" or context != "":
+ title = "No title." if not title else title
+ context = "No snippet available." if not context else context
+ contexts.append({
+ 'document': {"contents": f'\"{title}\"\n{context}'},
+ })
+ else:
+ content_dict = self.fetch_web_content(search_results)
+ contexts = []
+ for result in search_results:
+ for item in result.get("items", []):
+ link = item["link"]
+ title = item.get("title", "")
+ snippet = item.get("snippet", "")
+ if link in content_dict:
+ context = self.collect_context(snippet, content_dict[link])
+ if title != "" or context != "":
+ title = "No title." if not title else title
+ context = "No snippet available." if not context else context
+ contexts.append({
+ 'document': {"contents": f'\"{title}\"\n{context}'},
+ })
+
+ return contexts[:self.config.topk]
+
+
+# --- FastAPI App ---
+app = FastAPI(title="Online Search Proxy Server")
+
+class SearchRequest(BaseModel):
+ queries: List[str]
+
+config = OnlineSearchConfig(api_key=args.api_key, cse_id=args.cse_id, topk=args.topk, snippet_only=args.snippet_only)
+engine = OnlineSearchEngine(config)
+
+@app.post("/retrieve")
+def search_endpoint(request: SearchRequest):
+ results = engine.batch_search(request.queries)
+ return {"result": results}
+
+
+if __name__ == "__main__":
+ uvicorn.run(app, host="0.0.0.0", port=8000)
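
`sanitize_search_query` is the only barrier between raw `<search>` contents and the Google CSE API, so it is worth seeing what it does to a messy query. The function body is copied here so the snippet runs standalone:

```python
import re

def sanitize_search_query(query: str) -> str:
    query = re.sub(r'[^\w\s]', ' ', query)       # punctuation -> spaces
    query = re.sub(r'[\t\r\f\v\n]', ' ', query)  # control whitespace -> spaces
    return re.sub(r'\s+', ' ', query).strip()    # collapse runs of spaces

print(sanitize_search_query('  "Qwen2.5-7B"\n\t(Instruct)?? '))
# Qwen2 5 7B Instruct
```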
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/index_builder.py b/code/RL_model/verl/Search-R1/search_r1/search/index_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cba65a65e3656fd6787b5a1fe024c33c630fcaf
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/index_builder.py
@@ -0,0 +1,349 @@
+import os
+import faiss
+import json
+import warnings
+import numpy as np
+from typing import cast, List, Dict
+import shutil
+import subprocess
+import argparse
+import torch
+from tqdm import tqdm
+# from LongRAG.retriever.utils import load_model, load_corpus, pooling
+import datasets
+from transformers import AutoTokenizer, AutoModel, AutoConfig
+
+
+def load_model(
+ model_path: str,
+ use_fp16: bool = False
+ ):
+ model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+ model.eval()
+ model.cuda()
+ if use_fp16:
+ model = model.half()
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=True)
+
+ return model, tokenizer
+
+
+def pooling(
+ pooler_output,
+ last_hidden_state,
+ attention_mask = None,
+ pooling_method = "mean"
+ ):
+ if pooling_method == "mean":
+ last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
+ return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+ elif pooling_method == "cls":
+ return last_hidden_state[:, 0]
+ elif pooling_method == "pooler":
+ return pooler_output
+ else:
+ raise NotImplementedError("Pooling method not implemented!")
+
+
+def load_corpus(corpus_path: str):
+ corpus = datasets.load_dataset(
+ 'json',
+ data_files=corpus_path,
+ split="train",
+ num_proc=4)
+ return corpus
+
+
+class Index_Builder:
+ r"""A tool class used to build an index used in retrieval.
+
+ """
+ def __init__(
+ self,
+ retrieval_method,
+ model_path,
+ corpus_path,
+ save_dir,
+ max_length,
+ batch_size,
+ use_fp16,
+ pooling_method,
+ faiss_type=None,
+ embedding_path=None,
+ save_embedding=False,
+ faiss_gpu=False
+ ):
+
+ self.retrieval_method = retrieval_method.lower()
+ self.model_path = model_path
+ self.corpus_path = corpus_path
+ self.save_dir = save_dir
+ self.max_length = max_length
+ self.batch_size = batch_size
+ self.use_fp16 = use_fp16
+ self.pooling_method = pooling_method
+ self.faiss_type = faiss_type if faiss_type is not None else 'Flat'
+ self.embedding_path = embedding_path
+ self.save_embedding = save_embedding
+ self.faiss_gpu = faiss_gpu
+
+ self.gpu_num = torch.cuda.device_count()
+ # prepare save dir
+ print(self.save_dir)
+ if not os.path.exists(self.save_dir):
+ os.makedirs(self.save_dir)
+ else:
+ if not self._check_dir(self.save_dir):
+ warnings.warn("Some files already exists in save dir and may be overwritten.", UserWarning)
+
+ self.index_save_path = os.path.join(self.save_dir, f"{self.retrieval_method}_{self.faiss_type}.index")
+
+ self.embedding_save_path = os.path.join(self.save_dir, f"emb_{self.retrieval_method}.memmap")
+
+ self.corpus = load_corpus(self.corpus_path)
+
+ print("Finish loading...")
+ @staticmethod
+ def _check_dir(dir_path):
+ r"""Check if the dir path exists and if there is content.
+
+ """
+
+ if os.path.isdir(dir_path):
+ if len(os.listdir(dir_path)) > 0:
+ return False
+ else:
+ os.makedirs(dir_path, exist_ok=True)
+ return True
+
+ def build_index(self):
+ r"""Constructing different indexes based on selective retrieval method.
+
+ """
+ if self.retrieval_method == "bm25":
+ self.build_bm25_index()
+ else:
+ self.build_dense_index()
+
+ def build_bm25_index(self):
+ """Building BM25 index based on Pyserini library.
+
+ Reference: https://github.com/castorini/pyserini/blob/master/docs/usage-index.md#building-a-bm25-index-direct-java-implementation
+ """
+
+ # to use pyserini pipeline, we first need to place jsonl file in the folder
+ self.save_dir = os.path.join(self.save_dir, "bm25")
+ os.makedirs(self.save_dir, exist_ok=True)
+ temp_dir = self.save_dir + "/temp"
+ temp_file_path = temp_dir + "/temp.jsonl"
+ os.makedirs(temp_dir)
+
+ # if self.have_contents:
+ # shutil.copyfile(self.corpus_path, temp_file_path)
+ # else:
+ # with open(temp_file_path, "w") as f:
+ # for item in self.corpus:
+ # f.write(json.dumps(item) + "\n")
+ shutil.copyfile(self.corpus_path, temp_file_path)
+
+ print("Start building bm25 index...")
+ pyserini_args = ["--collection", "JsonCollection",
+ "--input", temp_dir,
+ "--index", self.save_dir,
+ "--generator", "DefaultLuceneDocumentGenerator",
+ "--threads", "1"]
+
+ subprocess.run(["python", "-m", "pyserini.index.lucene"] + pyserini_args)
+
+ shutil.rmtree(temp_dir)
+
+ print("Finish!")
+
+ def _load_embedding(self, embedding_path, corpus_size, hidden_size):
+ all_embeddings = np.memmap(
+ embedding_path,
+ mode="r",
+ dtype=np.float32
+ ).reshape(corpus_size, hidden_size)
+ return all_embeddings
+
+ def _save_embedding(self, all_embeddings):
+ memmap = np.memmap(
+ self.embedding_save_path,
+ shape=all_embeddings.shape,
+ mode="w+",
+ dtype=all_embeddings.dtype
+ )
+ length = all_embeddings.shape[0]
+ # add in batch
+ save_batch_size = 10000
+ if length > save_batch_size:
+ for i in tqdm(range(0, length, save_batch_size), leave=False, desc="Saving Embeddings"):
+ j = min(i + save_batch_size, length)
+ memmap[i: j] = all_embeddings[i: j]
+ else:
+ memmap[:] = all_embeddings
+
+ def encode_all(self):
+ if self.gpu_num > 1:
+ print("Use multi gpu!")
+ self.encoder = torch.nn.DataParallel(self.encoder)
+ self.batch_size = self.batch_size * self.gpu_num
+
+ all_embeddings = []
+
+ for start_idx in tqdm(range(0, len(self.corpus), self.batch_size), desc='Inference Embeddings:'):
+
+ # batch_data_title = self.corpus[start_idx:start_idx+self.batch_size]['title']
+ # batch_data_text = self.corpus[start_idx:start_idx+self.batch_size]['text']
+ # batch_data = ['"' + title + '"\n' + text for title, text in zip(batch_data_title, batch_data_text)]
+ batch_data = self.corpus[start_idx:start_idx+self.batch_size]['contents']
+
+ if self.retrieval_method == "e5":
+ batch_data = [f"passage: {doc}" for doc in batch_data]
+
+            inputs = self.tokenizer(
+                batch_data,
+                padding=True,
+                truncation=True,
+                return_tensors='pt',
+                max_length=self.max_length,
+            )
+            inputs = {k: v.cuda() for k, v in inputs.items()}
+
+ #TODO: support encoder-only T5 model
+ if "T5" in type(self.encoder).__name__:
+ # T5-based retrieval model
+ decoder_input_ids = torch.zeros(
+ (inputs['input_ids'].shape[0], 1), dtype=torch.long
+ ).to(inputs['input_ids'].device)
+ output = self.encoder(
+ **inputs, decoder_input_ids=decoder_input_ids, return_dict=True
+ )
+ embeddings = output.last_hidden_state[:, 0, :]
+
+ else:
+ output = self.encoder(**inputs, return_dict=True)
+ embeddings = pooling(output.pooler_output,
+ output.last_hidden_state,
+ inputs['attention_mask'],
+ self.pooling_method)
+ if "dpr" not in self.retrieval_method:
+ embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
+
+ embeddings = cast(torch.Tensor, embeddings)
+ embeddings = embeddings.detach().cpu().numpy()
+ all_embeddings.append(embeddings)
+
+ all_embeddings = np.concatenate(all_embeddings, axis=0)
+ all_embeddings = all_embeddings.astype(np.float32)
+
+ return all_embeddings
+
+ @torch.no_grad()
+ def build_dense_index(self):
+ """Obtain the representation of documents based on the embedding model(BERT-based) and
+ construct a faiss index.
+ """
+
+ if os.path.exists(self.index_save_path):
+ print("The index file already exists and will be overwritten.")
+
+ self.encoder, self.tokenizer = load_model(model_path = self.model_path,
+ use_fp16 = self.use_fp16)
+ if self.embedding_path is not None:
+ hidden_size = self.encoder.config.hidden_size
+ corpus_size = len(self.corpus)
+ all_embeddings = self._load_embedding(self.embedding_path, corpus_size, hidden_size)
+ else:
+ all_embeddings = self.encode_all()
+ if self.save_embedding:
+ self._save_embedding(all_embeddings)
+ del self.corpus
+
+ # build index
+ print("Creating index")
+ dim = all_embeddings.shape[-1]
+ faiss_index = faiss.index_factory(dim, self.faiss_type, faiss.METRIC_INNER_PRODUCT)
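+        # e.g. a faiss_type of "Flat" builds an exact inner-product index; other
+        # index_factory strings (IVF, HNSW, ...) trade accuracy for speed and memory.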
+
+ if self.faiss_gpu:
+ co = faiss.GpuMultipleClonerOptions()
+ co.useFloat16 = True
+ co.shard = True
+ faiss_index = faiss.index_cpu_to_all_gpus(faiss_index, co)
+ if not faiss_index.is_trained:
+ faiss_index.train(all_embeddings)
+ faiss_index.add(all_embeddings)
+ faiss_index = faiss.index_gpu_to_cpu(faiss_index)
+ else:
+ if not faiss_index.is_trained:
+ faiss_index.train(all_embeddings)
+ faiss_index.add(all_embeddings)
+
+ faiss.write_index(faiss_index, self.index_save_path)
+ print("Finish!")
+
+
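+# Default pooling strategy per retriever family; main() falls back to "mean"
+# when the retrieval method does not match any of these keys.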
+MODEL2POOLING = {
+ "e5": "mean",
+ "bge": "cls",
+ "contriever": "mean",
+ 'jina': 'mean'
+}
+
+
+def main():
+ parser = argparse.ArgumentParser(description = "Creating index.")
+
+ # Basic parameters
+ parser.add_argument('--retrieval_method', type=str)
+ parser.add_argument('--model_path', type=str, default=None)
+ parser.add_argument('--corpus_path', type=str)
+ parser.add_argument('--save_dir', default= 'indexes/',type=str)
+
+ # Parameters for building dense index
+ parser.add_argument('--max_length', type=int, default=180)
+ parser.add_argument('--batch_size', type=int, default=512)
+ parser.add_argument('--use_fp16', default=False, action='store_true')
+ parser.add_argument('--pooling_method', type=str, default=None)
+ parser.add_argument('--faiss_type',default=None,type=str)
+ parser.add_argument('--embedding_path', default=None, type=str)
+ parser.add_argument('--save_embedding', action='store_true', default=False)
+ parser.add_argument('--faiss_gpu', default=False, action='store_true')
+
+ args = parser.parse_args()
+
+    if args.pooling_method is None:
+        pooling_method = 'mean'
+        for k, v in MODEL2POOLING.items():
+            if k in args.retrieval_method.lower():
+                pooling_method = v
+                break
+    elif args.pooling_method in ['mean', 'cls', 'pooler']:
+        pooling_method = args.pooling_method
+    else:
+        raise NotImplementedError(f"Unsupported pooling method: {args.pooling_method}")
+
+
+ index_builder = Index_Builder(
+ retrieval_method = args.retrieval_method,
+ model_path = args.model_path,
+ corpus_path = args.corpus_path,
+ save_dir = args.save_dir,
+ max_length = args.max_length,
+ batch_size = args.batch_size,
+ use_fp16 = args.use_fp16,
+ pooling_method = pooling_method,
+ faiss_type = args.faiss_type,
+ embedding_path = args.embedding_path,
+ save_embedding = args.save_embedding,
+ faiss_gpu = args.faiss_gpu
+ )
+ index_builder.build_index()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/rerank_server.py b/code/RL_model/verl/Search-R1/search_r1/search/rerank_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..9edabe881bbc685786d6dde292ae8e72b0216aae
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/rerank_server.py
@@ -0,0 +1,161 @@
+import argparse
+from collections import defaultdict
+from typing import Optional
+from dataclasses import dataclass, field
+
+from sentence_transformers import CrossEncoder
+import torch
+from transformers import HfArgumentParser
+import numpy as np
+
+import uvicorn
+from fastapi import FastAPI
+from pydantic import BaseModel
+
+
+class BaseCrossEncoder:
+ def __init__(self, model, batch_size=32, device="cuda"):
+ self.model = model
+ self.batch_size = batch_size
+ self.model.to(device)
+
+ def _passage_to_string(self, doc_item):
+ if "document" not in doc_item:
+ content = doc_item['contents']
+ else:
+ content = doc_item['document']['contents']
+ title = content.split("\n")[0]
+ text = "\n".join(content.split("\n")[1:])
+
+ return f"(Title: {title}) {text}"
+
+ def rerank(self,
+ queries: list[str],
+ documents: list[list[dict]]):
+ """
+ Assume documents is a list of list of dicts, where each dict is a document with keys "id" and "contents".
+        This assumption is made to be consistent with the output of the retrieval server.
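+
+        Returns a dict mapping query index -> list of (passage_string, score)
+        tuples, sorted by score in descending order.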
+ """
+ assert len(queries) == len(documents)
+
+ pairs = []
+ qids = []
+        for qid, query in enumerate(queries):
+            # Pair each query only with its own retrieved documents.
+            for doc_item in documents[qid]:
+                doc = self._passage_to_string(doc_item)
+                pairs.append((query, doc))
+                qids.append(qid)
+
+ scores = self._predict(pairs)
+ query_to_doc_scores = defaultdict(list)
+
+ assert len(scores) == len(pairs) == len(qids)
+ for i in range(len(pairs)):
+ query, doc = pairs[i]
+ score = scores[i]
+ qid = qids[i]
+ query_to_doc_scores[qid].append((doc, score))
+
+ sorted_query_to_doc_scores = {}
+ for query, doc_scores in query_to_doc_scores.items():
+ sorted_query_to_doc_scores[query] = sorted(doc_scores, key=lambda x: x[1], reverse=True)
+
+ return sorted_query_to_doc_scores
+
+ def _predict(self, pairs: list[tuple[str, str]]):
+ raise NotImplementedError
+
+ @classmethod
+ def load(cls, model_name_or_path, **kwargs):
+ raise NotImplementedError
+
+
+class SentenceTransformerCrossEncoder(BaseCrossEncoder):
+ def __init__(self, model, batch_size=32, device="cuda"):
+ super().__init__(model, batch_size, device)
+
+ def _predict(self, pairs: list[tuple[str, str]]):
+ scores = self.model.predict(pairs, batch_size=self.batch_size)
+        scores = scores.tolist() if isinstance(scores, (torch.Tensor, np.ndarray)) else scores
+ return scores
+
+ @classmethod
+ def load(cls, model_name_or_path, **kwargs):
+ model = CrossEncoder(model_name_or_path)
+ return cls(model, **kwargs)
+
+
+class RerankRequest(BaseModel):
+ queries: list[str]
+ documents: list[list[dict]]
+ rerank_topk: Optional[int] = None
+ return_scores: bool = False
+
+
+@dataclass
+class RerankerArguments:
+ max_length: int = field(default=512)
+ rerank_topk: int = field(default=3)
+ rerank_model_name_or_path: str = field(default="cross-encoder/ms-marco-MiniLM-L12-v2")
+ batch_size: int = field(default=32)
+ reranker_type: str = field(default="sentence_transformer")
+
+def get_reranker(config):
+ if config.reranker_type == "sentence_transformer":
+ return SentenceTransformerCrossEncoder.load(
+ config.rerank_model_name_or_path,
+ batch_size=config.batch_size,
+ device="cuda" if torch.cuda.is_available() else "cpu"
+ )
+ else:
+ raise ValueError(f"Unknown reranker type: {config.reranker_type}")
+
+
+app = FastAPI()
+
+@app.post("/rerank")
+def rerank_endpoint(request: RerankRequest):
+ """
+ Endpoint that accepts queries and performs retrieval.
+ Input format:
+ {
+ "queries": ["What is Python?", "Tell me about neural networks."],
+ "documents": [[doc_item_1, ..., doc_item_k], [doc_item_1, ..., doc_item_k]],
+ "rerank_topk": 3,
+ "return_scores": true
+ }
+ """
+ if not request.rerank_topk:
+ request.rerank_topk = config.rerank_topk # fallback to default
+
+    # Perform batch reranking
+ # doc_scores already sorted by score
+ query_to_doc_scores = reranker.rerank(request.queries, request.documents)
+
+ # Format response
+ resp = []
+ for _, doc_scores in query_to_doc_scores.items():
+ doc_scores = doc_scores[:request.rerank_topk]
+ if request.return_scores:
+ combined = []
+ for doc, score in doc_scores:
+ combined.append({"document": doc, "score": score})
+ resp.append(combined)
+ else:
+ resp.append([doc for doc, _ in doc_scores])
+ return {"result": resp}
+
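+# Example client call (hypothetical payload; the server below listens on port 6980):
+#   requests.post("http://127.0.0.1:6980/rerank", json={
+#       "queries": ["What is Python?"],
+#       "documents": [[{"contents": "\"Python\"\nPython is a language."}]],
+#       "return_scores": True,
+#   })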
+
+if __name__ == "__main__":
+
+    # 1) Parse reranker arguments from the CLI.
+    parser = HfArgumentParser(RerankerArguments)
+    config = parser.parse_args_into_dataclasses()[0]
+
+ # 2) Instantiate a global retriever so it is loaded once and reused.
+ reranker = get_reranker(config)
+
+    # 3) Launch the server. It listens on http://0.0.0.0:6980
+ uvicorn.run(app, host="0.0.0.0", port=6980)
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/retrieval.py b/code/RL_model/verl/Search-R1/search_r1/search/retrieval.py
new file mode 100644
index 0000000000000000000000000000000000000000..125643a7bea6e83c612fe6ed02e25ea1a7464670
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/retrieval.py
@@ -0,0 +1,368 @@
+import json
+import os
+import warnings
+from typing import List, Dict
+import functools
+from tqdm import tqdm
+from multiprocessing import Pool
+import faiss
+import torch
+import numpy as np
+from transformers import AutoConfig, AutoTokenizer, AutoModel
+import argparse
+import datasets
+
+
+def load_corpus(corpus_path: str):
+ corpus = datasets.load_dataset(
+ 'json',
+ data_files=corpus_path,
+ split="train",
+ num_proc=4)
+ return corpus
+
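+# Each corpus line is expected to be a JSON object with a "contents" field whose
+# first line is the quoted title and the rest is the passage text, e.g.
+# {"id": "0", "contents": "\"Paris\"\nParis is the capital of France."}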
+
+def read_jsonl(file_path):
+ data = []
+
+ with open(file_path, "r") as f:
+ readin = f.readlines()
+ for line in readin:
+ data.append(json.loads(line))
+ return data
+
+
+def load_docs(corpus, doc_idxs):
+ results = [corpus[int(idx)] for idx in doc_idxs]
+
+ return results
+
+
+def load_model(
+ model_path: str,
+ use_fp16: bool = False
+ ):
+ model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+ model.eval()
+ model.cuda()
+ if use_fp16:
+ model = model.half()
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=True)
+
+ return model, tokenizer
+
+
+def pooling(
+ pooler_output,
+ last_hidden_state,
+ attention_mask = None,
+ pooling_method = "mean"
+ ):
+ if pooling_method == "mean":
+ last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
+ return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+ elif pooling_method == "cls":
+ return last_hidden_state[:, 0]
+ elif pooling_method == "pooler":
+ return pooler_output
+ else:
+ raise NotImplementedError("Pooling method not implemented!")
+
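+# "mean" averages last_hidden_state over non-padded positions (padding is zeroed
+# via the attention mask before summing), "cls" takes the first token's hidden
+# state, and "pooler" uses the model's pooled output.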
+
+class Encoder:
+ def __init__(self, model_name, model_path, pooling_method, max_length, use_fp16):
+ self.model_name = model_name
+ self.model_path = model_path
+ self.pooling_method = pooling_method
+ self.max_length = max_length
+ self.use_fp16 = use_fp16
+
+ self.model, self.tokenizer = load_model(model_path=model_path,
+ use_fp16=use_fp16)
+
+ @torch.no_grad()
+ def encode(self, query_list: List[str], is_query=True) -> np.ndarray:
+ # processing query for different encoders
+ if isinstance(query_list, str):
+ query_list = [query_list]
+
+ if "e5" in self.model_name.lower():
+ if is_query:
+ query_list = [f"query: {query}" for query in query_list]
+ else:
+ query_list = [f"passage: {query}" for query in query_list]
+
+ if "bge" in self.model_name.lower():
+ if is_query:
+ query_list = [f"Represent this sentence for searching relevant passages: {query}" for query in query_list]
+
+ inputs = self.tokenizer(query_list,
+ max_length=self.max_length,
+ padding=True,
+ truncation=True,
+ return_tensors="pt"
+ )
+ inputs = {k: v.cuda() for k, v in inputs.items()}
+
+ if "T5" in type(self.model).__name__:
+ # T5-based retrieval model
+ decoder_input_ids = torch.zeros(
+ (inputs['input_ids'].shape[0], 1), dtype=torch.long
+ ).to(inputs['input_ids'].device)
+ output = self.model(
+ **inputs, decoder_input_ids=decoder_input_ids, return_dict=True
+ )
+ query_emb = output.last_hidden_state[:, 0, :]
+
+ else:
+ output = self.model(**inputs, return_dict=True)
+ query_emb = pooling(output.pooler_output,
+ output.last_hidden_state,
+ inputs['attention_mask'],
+ self.pooling_method)
+ if "dpr" not in self.model_name.lower():
+ query_emb = torch.nn.functional.normalize(query_emb, dim=-1)
+
+ query_emb = query_emb.detach().cpu().numpy()
+ query_emb = query_emb.astype(np.float32, order="C")
+ return query_emb
+
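+# Usage sketch (assuming an e5-style checkpoint): encoding ["What is Python?"]
+# yields a (1, hidden_size) float32 array; for non-DPR methods the rows are
+# L2-normalized, so inner-product search behaves like cosine similarity.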
+
+class BaseRetriever:
+ """Base object for all retrievers."""
+
+ def __init__(self, config):
+ self.config = config
+ self.retrieval_method = config.retrieval_method
+ self.topk = config.retrieval_topk
+
+ self.index_path = config.index_path
+ self.corpus_path = config.corpus_path
+
+ # self.cache_save_path = os.path.join(config.save_dir, 'retrieval_cache.json')
+
+ def _search(self, query: str, num: int, return_score:bool) -> List[Dict[str, str]]:
+ r"""Retrieve topk relevant documents in corpus.
+ Return:
+ list: contains information related to the document, including:
+ contents: used for building index
+ title: (if provided)
+ text: (if provided)
+ """
+ pass
+
+ def _batch_search(self, query_list, num, return_score):
+ pass
+
+ def search(self, *args, **kwargs):
+ return self._search(*args, **kwargs)
+
+ def batch_search(self, *args, **kwargs):
+ return self._batch_search(*args, **kwargs)
+
+
+class BM25Retriever(BaseRetriever):
+ r"""BM25 retriever based on pre-built pyserini index."""
+
+ def __init__(self, config):
+ super().__init__(config)
+ from pyserini.search.lucene import LuceneSearcher
+ self.searcher = LuceneSearcher(self.index_path)
+ self.contain_doc = self._check_contain_doc()
+ if not self.contain_doc:
+ self.corpus = load_corpus(self.corpus_path)
+ self.max_process_num = 8
+
+ def _check_contain_doc(self):
+ r"""Check if the index contains document content
+ """
+ return self.searcher.doc(0).raw() is not None
+
+ def _search(self, query: str, num: int = None, return_score = False) -> List[Dict[str, str]]:
+ if num is None:
+ num = self.topk
+
+ hits = self.searcher.search(query, num)
+ if len(hits) < 1:
+ if return_score:
+ return [],[]
+ else:
+ return []
+
+ scores = [hit.score for hit in hits]
+ if len(hits) < num:
+ warnings.warn('Not enough documents retrieved!')
+ else:
+ hits = hits[:num]
+
+ if self.contain_doc:
+ all_contents = [json.loads(self.searcher.doc(hit.docid).raw())['contents'] for hit in hits]
+ results = [{'title': content.split("\n")[0].strip("\""),
+ 'text': "\n".join(content.split("\n")[1:]),
+ 'contents': content} for content in all_contents]
+ else:
+ results = load_docs(self.corpus, [hit.docid for hit in hits])
+
+ if return_score:
+ return results, scores
+ else:
+ return results
+
+ def _batch_search(self, query_list, num: int = None, return_score = False):
+ # TODO: modify batch method
+ results = []
+ scores = []
+ for query in query_list:
+ item_result, item_score = self._search(query, num,True)
+ results.append(item_result)
+ scores.append(item_score)
+
+ if return_score:
+ return results, scores
+ else:
+ return results
+
+def get_available_gpu_memory():
+ memory_info = []
+ for i in range(torch.cuda.device_count()):
+ total_memory = torch.cuda.get_device_properties(i).total_memory
+ allocated_memory = torch.cuda.memory_allocated(i)
+ free_memory = total_memory - allocated_memory
+ memory_info.append((i, free_memory / 1e9)) # Convert to GB
+ return memory_info
+
+
+class DenseRetriever(BaseRetriever):
+ r"""Dense retriever based on pre-built faiss index."""
+
+ def __init__(self, config: dict):
+ super().__init__(config)
+ self.index = faiss.read_index(self.index_path)
+ if config.faiss_gpu:
+ co = faiss.GpuMultipleClonerOptions()
+ co.useFloat16 = True
+ co.shard = True
+ self.index = faiss.index_cpu_to_all_gpus(self.index, co=co)
+ # self.index = faiss.index_cpu_to_all_gpus(self.index)
+
+ self.corpus = load_corpus(self.corpus_path)
+ self.encoder = Encoder(
+ model_name = self.retrieval_method,
+ model_path = config.retrieval_model_path,
+ pooling_method = config.retrieval_pooling_method,
+ max_length = config.retrieval_query_max_length,
+ use_fp16 = config.retrieval_use_fp16
+ )
+ self.topk = config.retrieval_topk
+ self.batch_size = self.config.retrieval_batch_size
+
+ def _search(self, query: str, num: int = None, return_score = False):
+ if num is None:
+ num = self.topk
+ query_emb = self.encoder.encode(query)
+ scores, idxs = self.index.search(query_emb, k=num)
+ idxs = idxs[0]
+ scores = scores[0]
+
+ results = load_docs(self.corpus, idxs)
+ if return_score:
+ return results, scores
+ else:
+ return results
+
+ def _batch_search(self, query_list: List[str], num: int = None, return_score = False):
+ if isinstance(query_list, str):
+ query_list = [query_list]
+ if num is None:
+ num = self.topk
+
+ batch_size = self.batch_size
+
+ results = []
+ scores = []
+
+ for start_idx in tqdm(range(0, len(query_list), batch_size), desc='Retrieval process: '):
+ query_batch = query_list[start_idx:start_idx + batch_size]
+
+            batch_emb = self.encoder.encode(query_batch)
+            batch_scores, batch_idxs = self.index.search(batch_emb, k=num)
+            batch_scores = batch_scores.tolist()
+            batch_idxs = batch_idxs.tolist()
+
+ flat_idxs = sum(batch_idxs, [])
+ batch_results = load_docs(self.corpus, flat_idxs)
+ batch_results = [batch_results[i*num : (i+1)*num] for i in range(len(batch_idxs))]
+
+ scores.extend(batch_scores)
+ results.extend(batch_results)
+
+ if return_score:
+ return results, scores
+ else:
+ return results
+
+def get_retriever(config):
+ r"""Automatically select retriever class based on config's retrieval method
+
+ Args:
+ config (dict): configuration with 'retrieval_method' key
+
+ Returns:
+ Retriever: retriever instance
+ """
+ if config.retrieval_method == "bm25":
+ return BM25Retriever(config)
+ else:
+ return DenseRetriever(config)
+
+
+def get_dataset(config):
+ """Load dataset from config."""
+
+ split_path = os.path.join(config.dataset_path, f'{config.data_split}.jsonl')
+ return read_jsonl(split_path)
+
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser(description = "Retrieval")
+
+ # Basic parameters
+ parser.add_argument('--retrieval_method', type=str)
+ parser.add_argument('--retrieval_topk', type=int, default=10)
+ parser.add_argument('--index_path', type=str, default=None)
+ parser.add_argument('--corpus_path', type=str)
+ parser.add_argument('--dataset_path', default=None, type=str)
+
+    # Note: argparse's type=bool treats any non-empty string (even "False") as
+    # True, so parse the value explicitly.
+    parser.add_argument('--faiss_gpu', default=True, type=lambda s: str(s).lower() in ('true', '1'))
+ parser.add_argument('--data_split', default="train", type=str)
+
+ parser.add_argument('--retrieval_model_path', type=str, default=None)
+ parser.add_argument('--retrieval_pooling_method', default='mean', type=str)
+    parser.add_argument('--retrieval_query_max_length', default=256, type=int)
+ parser.add_argument('--retrieval_use_fp16', action='store_true', default=False)
+ parser.add_argument('--retrieval_batch_size', default=512, type=int)
+
+ args = parser.parse_args()
+
+    if args.retrieval_method == 'bm25':
+        args.index_path = os.path.join(args.index_path, 'bm25')
+    else:
+        args.index_path = os.path.join(args.index_path, f'{args.retrieval_method}_Flat.index')
+
+ # load dataset
+ all_split = get_dataset(args)
+
+ input_query = [sample['question'] for sample in all_split[:512]]
+
+ # initialize the retriever and conduct retrieval
+ retriever = get_retriever(args)
+ print('Start Retrieving ...')
+ results, scores = retriever.batch_search(input_query, return_score=True)
+
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/retrieval.sh b/code/RL_model/verl/Search-R1/search_r1/search/retrieval.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5326ea2840f3a816540fea28f8b557ae02291248
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/retrieval.sh
@@ -0,0 +1,25 @@
+
+DATA_NAME=nq
+
+DATASET_PATH="/home/peterjin/mnt/data/$DATA_NAME"
+
+SPLIT='test'
+TOPK=3
+
+INDEX_PATH=/home/peterjin/mnt/index/wiki-18
+CORPUS_PATH=/home/peterjin/mnt/data/retrieval-corpus/wiki-18.jsonl
+SAVE_NAME=e5_${TOPK}_wiki18.json
+
+# INDEX_PATH=/home/peterjin/rm_retrieval_corpus/index/wiki-21
+# CORPUS_PATH=/home/peterjin/rm_retrieval_corpus/corpora/wiki/enwiki-dec2021/text-list-100-sec.jsonl
+# SAVE_NAME=e5_${TOPK}_wiki21.json
+
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python retrieval.py --retrieval_method e5 \
+ --retrieval_topk $TOPK \
+ --index_path $INDEX_PATH \
+ --corpus_path $CORPUS_PATH \
+ --dataset_path $DATASET_PATH \
+ --data_split $SPLIT \
+ --retrieval_model_path "intfloat/e5-base-v2" \
+ --retrieval_pooling_method "mean" \
+ --retrieval_batch_size 512 \
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/retrieval_request.py b/code/RL_model/verl/Search-R1/search_r1/search/retrieval_request.py
new file mode 100644
index 0000000000000000000000000000000000000000..de0a4df6d7adc71c8366938572898c6116276c0e
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/retrieval_request.py
@@ -0,0 +1,23 @@
+import requests
+
+# URL for your local FastAPI server
+url = "http://127.0.0.1:8000/retrieve"
+
+# Example payload
+payload = {
+ "queries": ["What is the capital of France?", "Explain neural networks."] * 200,
+ "topk": 5,
+ "return_scores": True
+}
+
+# Send POST request
+response = requests.post(url, json=payload)
+
+# Raise an exception if the request failed
+response.raise_for_status()
+
+# Get the JSON response
+retrieved_data = response.json()
+
+print("Response from server:")
+print(retrieved_data)
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/retrieval_rerank_server.py b/code/RL_model/verl/Search-R1/search_r1/search/retrieval_rerank_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9e14f7bcde1c8c50076ccf464e5e5acdc1bdcff
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/retrieval_rerank_server.py
@@ -0,0 +1,123 @@
+# pip install -U sentence-transformers
+import os
+import re
+import argparse
+from dataclasses import dataclass, field
+from typing import List, Optional
+from collections import defaultdict
+
+import torch
+import numpy as np
+from fastapi import FastAPI
+from pydantic import BaseModel
+from sentence_transformers import CrossEncoder
+
+from retrieval_server import get_retriever, Config as RetrieverConfig
+from rerank_server import SentenceTransformerCrossEncoder
+
+app = FastAPI()
+
+def convert_title_format(text):
+ # Use regex to extract the title and the content
+ match = re.match(r'\(Title:\s*([^)]+)\)\s*(.+)', text, re.DOTALL)
+ if match:
+ title, content = match.groups()
+ return f'\"{title}\"\n{content}'
+ else:
+ return text
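+
+# Example: '(Title: Paris) Paris is the capital.' -> '"Paris"\nParis is the capital.'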
+
+# ----------- Combined Request Schema -----------
+class SearchRequest(BaseModel):
+ queries: List[str]
+ topk_retrieval: Optional[int] = 10
+ topk_rerank: Optional[int] = 3
+ return_scores: bool = False
+
+# ----------- Reranker Config Schema -----------
+@dataclass
+class RerankerArguments:
+ max_length: int = field(default=512)
+ rerank_topk: int = field(default=3)
+ rerank_model_name_or_path: str = field(default="cross-encoder/ms-marco-MiniLM-L12-v2")
+ batch_size: int = field(default=32)
+ reranker_type: str = field(default="sentence_transformer")
+
+def get_reranker(config):
+ if config.reranker_type == "sentence_transformer":
+ return SentenceTransformerCrossEncoder.load(
+ config.rerank_model_name_or_path,
+ batch_size=config.batch_size,
+ device="cuda" if torch.cuda.is_available() else "cpu"
+ )
+ else:
+ raise ValueError(f"Unknown reranker type: {config.reranker_type}")
+
+# ----------- Endpoint -----------
+@app.post("/retrieve")
+def search_endpoint(request: SearchRequest):
+ # Step 1: Retrieve documents
+ retrieved_docs = retriever.batch_search(
+ query_list=request.queries,
+ num=request.topk_retrieval,
+ return_score=False
+ )
+
+ # Step 2: Rerank
+ reranked = reranker.rerank(request.queries, retrieved_docs)
+
+ # Step 3: Format response
+ response = []
+ for i, doc_scores in reranked.items():
+ doc_scores = doc_scores[:request.topk_rerank]
+ if request.return_scores:
+ combined = []
+ for doc, score in doc_scores:
+ combined.append({"document": convert_title_format(doc), "score": score})
+ response.append(combined)
+ else:
+ response.append([convert_title_format(doc) for doc, _ in doc_scores])
+
+ return {"result": response}
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser(description="Launch the local faiss retriever.")
+ # retriever
+ parser.add_argument("--index_path", type=str, default="/home/peterjin/mnt/index/wiki-18/e5_Flat.index", help="Corpus indexing file.")
+ parser.add_argument("--corpus_path", type=str, default="/home/peterjin/mnt/data/retrieval-corpus/wiki-18.jsonl", help="Local corpus file.")
+ parser.add_argument("--retrieval_topk", type=int, default=10, help="Number of retrieved passages for one query.")
+ parser.add_argument("--retriever_name", type=str, default="e5", help="Name of the retriever model.")
+ parser.add_argument("--retriever_model", type=str, default="intfloat/e5-base-v2", help="Path of the retriever model.")
+ parser.add_argument('--faiss_gpu', action='store_true', help='Use GPU for computation')
+ # reranker
+ parser.add_argument("--reranking_topk", type=int, default=3, help="Number of reranked passages for one query.")
+ parser.add_argument("--reranker_model", type=str, default="cross-encoder/ms-marco-MiniLM-L12-v2", help="Path of the reranker model.")
+ parser.add_argument("--reranker_batch_size", type=int, default=32, help="Batch size for the reranker inference.")
+
+ args = parser.parse_args()
+
+ # ----------- Load Retriever and Reranker -----------
+ retriever_config = RetrieverConfig(
+ retrieval_method = args.retriever_name,
+ index_path=args.index_path,
+ corpus_path=args.corpus_path,
+ retrieval_topk=args.retrieval_topk,
+ faiss_gpu=args.faiss_gpu,
+ retrieval_model_path=args.retriever_model,
+ retrieval_pooling_method="mean",
+ retrieval_query_max_length=256,
+ retrieval_use_fp16=True,
+ retrieval_batch_size=512,
+ )
+ retriever = get_retriever(retriever_config)
+
+ reranker_config = RerankerArguments(
+ rerank_topk = args.reranking_topk,
+ rerank_model_name_or_path = args.reranker_model,
+ batch_size = args.reranker_batch_size,
+ )
+ reranker = get_reranker(reranker_config)
+
+ import uvicorn
+ uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/retrieval_server.py b/code/RL_model/verl/Search-R1/search_r1/search/retrieval_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..f39698980c1da3abdf715dcdd78916cf1dbdc935
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/retrieval_server.py
@@ -0,0 +1,392 @@
+import json
+import os
+import warnings
+from typing import List, Dict, Optional
+import argparse
+
+import faiss
+import torch
+import numpy as np
+from transformers import AutoConfig, AutoTokenizer, AutoModel
+from tqdm import tqdm
+import datasets
+
+import uvicorn
+from fastapi import FastAPI
+from pydantic import BaseModel
+
+def load_corpus(corpus_path: str):
+ corpus = datasets.load_dataset(
+ 'json',
+ data_files=corpus_path,
+ split="train",
+ num_proc=4
+ )
+ return corpus
+
+def read_jsonl(file_path):
+ data = []
+ with open(file_path, "r") as f:
+ for line in f:
+ data.append(json.loads(line))
+ return data
+
+def load_docs(corpus, doc_idxs):
+ results = [corpus[int(idx)] for idx in doc_idxs]
+ return results
+
+def load_model(model_path: str, use_fp16: bool = False):
+ model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+ model.eval()
+ model.cuda()
+ if use_fp16:
+ model = model.half()
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=True)
+ return model, tokenizer
+
+def pooling(
+ pooler_output,
+ last_hidden_state,
+ attention_mask = None,
+ pooling_method = "mean"
+):
+ if pooling_method == "mean":
+ last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
+ return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+ elif pooling_method == "cls":
+ return last_hidden_state[:, 0]
+ elif pooling_method == "pooler":
+ return pooler_output
+ else:
+ raise NotImplementedError("Pooling method not implemented!")
+
+class Encoder:
+ def __init__(self, model_name, model_path, pooling_method, max_length, use_fp16):
+ self.model_name = model_name
+ self.model_path = model_path
+ self.pooling_method = pooling_method
+ self.max_length = max_length
+ self.use_fp16 = use_fp16
+
+ self.model, self.tokenizer = load_model(model_path=model_path, use_fp16=use_fp16)
+ self.model.eval()
+
+ @torch.no_grad()
+ def encode(self, query_list: List[str], is_query=True) -> np.ndarray:
+ # processing query for different encoders
+ if isinstance(query_list, str):
+ query_list = [query_list]
+
+ if "e5" in self.model_name.lower():
+ if is_query:
+ query_list = [f"query: {query}" for query in query_list]
+ else:
+ query_list = [f"passage: {query}" for query in query_list]
+
+ if "bge" in self.model_name.lower():
+ if is_query:
+ query_list = [f"Represent this sentence for searching relevant passages: {query}" for query in query_list]
+
+ inputs = self.tokenizer(query_list,
+ max_length=self.max_length,
+ padding=True,
+ truncation=True,
+ return_tensors="pt"
+ )
+ inputs = {k: v.cuda() for k, v in inputs.items()}
+
+ if "T5" in type(self.model).__name__:
+ # T5-based retrieval model
+ decoder_input_ids = torch.zeros(
+ (inputs['input_ids'].shape[0], 1), dtype=torch.long
+ ).to(inputs['input_ids'].device)
+ output = self.model(
+ **inputs, decoder_input_ids=decoder_input_ids, return_dict=True
+ )
+ query_emb = output.last_hidden_state[:, 0, :]
+ else:
+ output = self.model(**inputs, return_dict=True)
+ query_emb = pooling(output.pooler_output,
+ output.last_hidden_state,
+ inputs['attention_mask'],
+ self.pooling_method)
+ if "dpr" not in self.model_name.lower():
+ query_emb = torch.nn.functional.normalize(query_emb, dim=-1)
+
+ query_emb = query_emb.detach().cpu().numpy()
+ query_emb = query_emb.astype(np.float32, order="C")
+
+ del inputs, output
+ torch.cuda.empty_cache()
+
+ return query_emb
+
+class BaseRetriever:
+ def __init__(self, config):
+ self.config = config
+ self.retrieval_method = config.retrieval_method
+ self.topk = config.retrieval_topk
+
+ self.index_path = config.index_path
+ self.corpus_path = config.corpus_path
+
+ def _search(self, query: str, num: int, return_score: bool):
+ raise NotImplementedError
+
+ def _batch_search(self, query_list: List[str], num: int, return_score: bool):
+ raise NotImplementedError
+
+ def search(self, query: str, num: int = None, return_score: bool = False):
+ return self._search(query, num, return_score)
+
+ def batch_search(self, query_list: List[str], num: int = None, return_score: bool = False):
+ return self._batch_search(query_list, num, return_score)
+
+class BM25Retriever(BaseRetriever):
+ def __init__(self, config):
+ super().__init__(config)
+ from pyserini.search.lucene import LuceneSearcher
+ self.searcher = LuceneSearcher(self.index_path)
+ self.contain_doc = self._check_contain_doc()
+ if not self.contain_doc:
+ self.corpus = load_corpus(self.corpus_path)
+ self.max_process_num = 8
+
+ def _check_contain_doc(self):
+ return self.searcher.doc(0).raw() is not None
+
+ def _search(self, query: str, num: int = None, return_score: bool = False):
+ if num is None:
+ num = self.topk
+ hits = self.searcher.search(query, num)
+ if len(hits) < 1:
+ if return_score:
+ return [], []
+ else:
+ return []
+ scores = [hit.score for hit in hits]
+ if len(hits) < num:
+ warnings.warn('Not enough documents retrieved!')
+ else:
+ hits = hits[:num]
+
+ if self.contain_doc:
+ all_contents = [
+ json.loads(self.searcher.doc(hit.docid).raw())['contents']
+ for hit in hits
+ ]
+ results = [
+ {
+ 'title': content.split("\n")[0].strip("\""),
+ 'text': "\n".join(content.split("\n")[1:]),
+ 'contents': content
+ }
+ for content in all_contents
+ ]
+ else:
+ results = load_docs(self.corpus, [hit.docid for hit in hits])
+
+ if return_score:
+ return results, scores
+ else:
+ return results
+
+ def _batch_search(self, query_list: List[str], num: int = None, return_score: bool = False):
+ results = []
+ scores = []
+ for query in query_list:
+ item_result, item_score = self._search(query, num, True)
+ results.append(item_result)
+ scores.append(item_score)
+ if return_score:
+ return results, scores
+ else:
+ return results
+
+class DenseRetriever(BaseRetriever):
+ def __init__(self, config):
+ super().__init__(config)
+ self.index = faiss.read_index(self.index_path)
+ if config.faiss_gpu:
+ co = faiss.GpuMultipleClonerOptions()
+ co.useFloat16 = True
+ co.shard = True
+ self.index = faiss.index_cpu_to_all_gpus(self.index, co=co)
+
+ self.corpus = load_corpus(self.corpus_path)
+ self.encoder = Encoder(
+ model_name = self.retrieval_method,
+ model_path = config.retrieval_model_path,
+ pooling_method = config.retrieval_pooling_method,
+ max_length = config.retrieval_query_max_length,
+ use_fp16 = config.retrieval_use_fp16
+ )
+ self.topk = config.retrieval_topk
+ self.batch_size = config.retrieval_batch_size
+
+ def _search(self, query: str, num: int = None, return_score: bool = False):
+ if num is None:
+ num = self.topk
+ query_emb = self.encoder.encode(query)
+ scores, idxs = self.index.search(query_emb, k=num)
+ idxs = idxs[0]
+ scores = scores[0]
+ results = load_docs(self.corpus, idxs)
+ if return_score:
+ return results, scores.tolist()
+ else:
+ return results
+
+ def _batch_search(self, query_list: List[str], num: int = None, return_score: bool = False):
+ if isinstance(query_list, str):
+ query_list = [query_list]
+ if num is None:
+ num = self.topk
+
+ results = []
+ scores = []
+ for start_idx in tqdm(range(0, len(query_list), self.batch_size), desc='Retrieval process: '):
+ query_batch = query_list[start_idx:start_idx + self.batch_size]
+ batch_emb = self.encoder.encode(query_batch)
+ batch_scores, batch_idxs = self.index.search(batch_emb, k=num)
+ batch_scores = batch_scores.tolist()
+ batch_idxs = batch_idxs.tolist()
+
+            # load_docs is not vectorized; it fetches documents one index at a time
+ flat_idxs = sum(batch_idxs, [])
+ batch_results = load_docs(self.corpus, flat_idxs)
+ # chunk them back
+ batch_results = [batch_results[i*num : (i+1)*num] for i in range(len(batch_idxs))]
+
+ results.extend(batch_results)
+ scores.extend(batch_scores)
+
+ del batch_emb, batch_scores, batch_idxs, query_batch, flat_idxs, batch_results
+ torch.cuda.empty_cache()
+
+ if return_score:
+ return results, scores
+ else:
+ return results
+
+def get_retriever(config):
+ if config.retrieval_method == "bm25":
+ return BM25Retriever(config)
+ else:
+ return DenseRetriever(config)
+
+
+#####################################
+# FastAPI server below
+#####################################
+
+class Config:
+ """
+ Minimal config class (simulating your argparse)
+ Replace this with your real arguments or load them dynamically.
+ """
+ def __init__(
+ self,
+ retrieval_method: str = "bm25",
+ retrieval_topk: int = 10,
+ index_path: str = "./index/bm25",
+ corpus_path: str = "./data/corpus.jsonl",
+ dataset_path: str = "./data",
+ data_split: str = "train",
+ faiss_gpu: bool = True,
+ retrieval_model_path: str = "./model",
+ retrieval_pooling_method: str = "mean",
+ retrieval_query_max_length: int = 256,
+ retrieval_use_fp16: bool = False,
+ retrieval_batch_size: int = 128
+ ):
+ self.retrieval_method = retrieval_method
+ self.retrieval_topk = retrieval_topk
+ self.index_path = index_path
+ self.corpus_path = corpus_path
+ self.dataset_path = dataset_path
+ self.data_split = data_split
+ self.faiss_gpu = faiss_gpu
+ self.retrieval_model_path = retrieval_model_path
+ self.retrieval_pooling_method = retrieval_pooling_method
+ self.retrieval_query_max_length = retrieval_query_max_length
+ self.retrieval_use_fp16 = retrieval_use_fp16
+ self.retrieval_batch_size = retrieval_batch_size
+
+
+class QueryRequest(BaseModel):
+ queries: List[str]
+ topk: Optional[int] = None
+ return_scores: bool = False
+
+
+app = FastAPI()
+
+@app.post("/retrieve")
+def retrieve_endpoint(request: QueryRequest):
+ """
+ Endpoint that accepts queries and performs retrieval.
+ Input format:
+ {
+ "queries": ["What is Python?", "Tell me about neural networks."],
+ "topk": 3,
+ "return_scores": true
+ }
+ """
+ if not request.topk:
+ request.topk = config.retrieval_topk # fallback to default
+
+ # Perform batch retrieval
+    # Always request scores so the return value unpacks consistently; scores are
+    # only included in the response when return_scores is set.
+    results, scores = retriever.batch_search(
+        query_list=request.queries,
+        num=request.topk,
+        return_score=True
+    )
+
+ # Format response
+ resp = []
+ for i, single_result in enumerate(results):
+ if request.return_scores:
+ # If scores are returned, combine them with results
+ combined = []
+ for doc, score in zip(single_result, scores[i]):
+ combined.append({"document": doc, "score": score})
+ resp.append(combined)
+ else:
+ resp.append(single_result)
+ return {"result": resp}
+
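+# A ready-made client example for this endpoint lives in retrieval_request.py
+# (POST /retrieve with "queries", "topk" and "return_scores").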
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser(description="Launch the local faiss retriever.")
+ parser.add_argument("--index_path", type=str, default="/home/peterjin/mnt/index/wiki-18/e5_Flat.index", help="Corpus indexing file.")
+ parser.add_argument("--corpus_path", type=str, default="/home/peterjin/mnt/data/retrieval-corpus/wiki-18.jsonl", help="Local corpus file.")
+ parser.add_argument("--topk", type=int, default=3, help="Number of retrieved passages for one query.")
+ parser.add_argument("--retriever_name", type=str, default="e5", help="Name of the retriever model.")
+ parser.add_argument("--retriever_model", type=str, default="intfloat/e5-base-v2", help="Path of the retriever model.")
+ parser.add_argument('--faiss_gpu', action='store_true', help='Use GPU for computation')
+
+ args = parser.parse_args()
+
+    # 1) Build a config from the parsed CLI arguments.
+ config = Config(
+        retrieval_method = args.retriever_name,  # "bm25" or a dense retriever name, e.g. "e5"
+ index_path=args.index_path,
+ corpus_path=args.corpus_path,
+ retrieval_topk=args.topk,
+ faiss_gpu=args.faiss_gpu,
+ retrieval_model_path=args.retriever_model,
+ retrieval_pooling_method="mean",
+ retrieval_query_max_length=256,
+ retrieval_use_fp16=True,
+ retrieval_batch_size=512,
+ )
+
+ # 2) Instantiate a global retriever so it is loaded once and reused.
+ retriever = get_retriever(config)
+
+    # 3) Launch the server. It listens on http://0.0.0.0:8000
+ uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/code/RL_model/verl/Search-R1/search_r1/search/serp_search_server.py b/code/RL_model/verl/Search-R1/search_r1/search/serp_search_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..30a10de3fa44aa6af20a12417ed9cf215319ad6f
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/search_r1/search/serp_search_server.py
@@ -0,0 +1,112 @@
+import os
+import requests
+from fastapi import FastAPI
+from pydantic import BaseModel
+from typing import List, Optional, Dict
+from concurrent.futures import ThreadPoolExecutor
+import argparse
+import uvicorn
+
+parser = argparse.ArgumentParser(description="Launch online search server.")
+parser.add_argument('--search_url', type=str, required=True,
+ help="URL for search engine (e.g. https://serpapi.com/search)")
+parser.add_argument('--topk', type=int, default=3,
+ help="Number of results to return per query")
+parser.add_argument('--serp_api_key', type=str, default=None,
+ help="SerpAPI key for online search")
+parser.add_argument('--serp_engine', type=str, default="google",
+ help="SerpAPI engine for online search")
+args = parser.parse_args()
+
+# --- Config ---
+class OnlineSearchConfig:
+ def __init__(
+ self,
+ search_url: str = "https://serpapi.com/search",
+ topk: int = 3,
+ serp_api_key: Optional[str] = None,
+ serp_engine: Optional[str] = None,
+ ):
+ self.search_url = search_url
+ self.topk = topk
+ self.serp_api_key = serp_api_key
+ self.serp_engine = serp_engine
+
+
+# --- Online Search Wrapper ---
+class OnlineSearchEngine:
+ def __init__(self, config: OnlineSearchConfig):
+ self.config = config
+
+ def _search_query(self, query: str):
+ params = {
+ "engine": self.config.serp_engine,
+ "q": query,
+ "api_key": self.config.serp_api_key,
+ }
+ response = requests.get(self.config.search_url, params=params)
+ return response.json()
+
+ def batch_search(self, queries: List[str]):
+ results = []
+ with ThreadPoolExecutor() as executor:
+ for result in executor.map(self._search_query, queries):
+ results.append(self._process_result(result))
+ return results
+
+ def _process_result(self, search_result: Dict):
+ results = []
+
+ answer_box = search_result.get('answer_box', {})
+ if answer_box:
+ title = answer_box.get('title', 'No title.')
+ snippet = answer_box.get('snippet', 'No snippet available.')
+ results.append({
+ 'document': {"contents": f'\"{title}\"\n{snippet}'},
+ })
+
+ organic_results = search_result.get('organic_results', [])
+        for result in organic_results[:self.config.topk]:
+ title = result.get('title', 'No title.')
+ snippet = result.get('snippet', 'No snippet available.')
+ results.append({
+ 'document': {"contents": f'\"{title}\"\n{snippet}'},
+ })
+
+ related_results = search_result.get('related_questions', [])
+        for result in related_results[:self.config.topk]:
+            title = result.get('question', 'No title.')  # the question serves as the title here
+ snippet = result.get('snippet', 'No snippet available.')
+ results.append({
+ 'document': {"contents": f'\"{title}\"\n{snippet}'},
+ })
+
+ return results
+
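+# Example (hypothetical SerpAPI response): an answer box with title "Paris" and
+# snippet "Capital of France." is flattened to
+# {'document': {'contents': '"Paris"\nCapital of France.'}}.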
+
+# --- FastAPI Setup ---
+app = FastAPI(title="Online Search Proxy Server")
+
+class SearchRequest(BaseModel):
+ queries: List[str]
+
+# Instantiate global config + engine
+config = OnlineSearchConfig(
+ search_url=args.search_url,
+ topk=args.topk,
+ serp_api_key=args.serp_api_key,
+ serp_engine=args.serp_engine,
+)
+engine = OnlineSearchEngine(config)
+
+# --- Routes ---
+@app.post("/retrieve")
+def search_endpoint(request: SearchRequest):
+ results = engine.batch_search(request.queries)
+ return {"result": results}
+
+## return {"result": List[List[{'document': {"id": xx, "content": "title" + \n + "content"}, 'score': xx}]]}
+
+if __name__ == "__main__":
+    # Launch the server. It listens on http://0.0.0.0:8000
+ uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/code/RL_model/verl/Search-R1/setup.py b/code/RL_model/verl/Search-R1/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..9aab68a3e8959317a9fbec484b9623912e633250
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/setup.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# setup.py is the fallback installation script when pyproject.toml does not work
+from setuptools import setup, find_packages
+import os
+
+version_folder = os.path.dirname(os.path.abspath(__file__))
+
+with open(os.path.join(version_folder, 'verl/version/version')) as f:
+ __version__ = f.read().strip()
+
+
+with open('requirements.txt') as f:
+ required = f.read().splitlines()
+    # Skip blank lines as well as comments to avoid an IndexError on item.strip()[0].
+    install_requires = [item.strip() for item in required if item.strip() and not item.strip().startswith('#')]
+
+extras_require = {
+ 'test': ['pytest', 'yapf']
+}
+
+from pathlib import Path
+this_directory = Path(__file__).parent
+long_description = (this_directory / "README.md").read_text()
+
+setup(
+ name='verl',
+ version=__version__,
+ package_dir={'': '.'},
+ packages=find_packages(where='.'),
+ url='https://github.com/volcengine/verl',
+ license='Apache 2.0',
+ author='Bytedance - Seed - MLSys',
+ author_email='zhangchi.usc1992@bytedance.com, gmsheng@connect.hku.hk',
+ description='veRL: Volcano Engine Reinforcement Learning for LLM',
+ install_requires=install_requires,
+ extras_require=extras_require,
+ package_data={'': ['version/*'],
+ 'verl': ['trainer/config/*.yaml'],},
+ include_package_data=True,
+ long_description=long_description,
+ long_description_content_type='text/markdown'
+)
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/train_grpo.sh b/code/RL_model/verl/Search-R1/train_grpo.sh
new file mode 100644
index 0000000000000000000000000000000000000000..51acdc48bc0fc072c1ac4a6e7fd394204bdcfb03
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/train_grpo.sh
@@ -0,0 +1,46 @@
+
+export PYTORCH_CUDA_ALLOC_CONF=""
+export EXPERIMENT_NAME=llm_guard_3B_10k_v2
+export WANDB_PROJECT='guard'
+export CUDA_DEVICE_ORDER="PCI_BUS_ID"
+export CUDA_VISIBLE_DEVICES=1,2
+export VLLM_ATTENTION_BACKEND=FLASH_ATTN
+
+
+PYTHONUNBUFFERED=1 NCCL_P2P_DISABLE=1 NCCL_IB_DISABLE=1 python3 -m verl.trainer.main_ppo \
+ data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet \
+ data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet \
+ data.train_batch_size=64 \
+ data.val_batch_size=64 \
+ data.max_prompt_length=4096 \
+ data.max_response_length=1024 \
+ data.shuffle_train_dataloader=True \
+ algorithm.adv_estimator=grpo \
+ actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 \
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
+ actor_rollout_ref.model.use_remove_padding=False \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=64 \
+ +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=true \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=true \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=64 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size=64 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
+ trainer.logger=['wandb'] \
+ trainer.n_gpus_per_node=2 \
+ trainer.nnodes=1 \
+ trainer.save_freq=100 \
+ trainer.test_freq=50 \
+ trainer.project_name=$WANDB_PROJECT \
+ trainer.experiment_name=$EXPERIMENT_NAME \
+ trainer.total_epochs=15 \
+ trainer.total_training_steps=1005 \
+ trainer.default_local_dir=verl_checkpoints/$EXPERIMENT_NAME \
+ do_search=false \
+ max_turns=1 \
+ 2>&1 | tee $EXPERIMENT_NAME.log
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/train_ppo.sh b/code/RL_model/verl/Search-R1/train_ppo.sh
new file mode 100644
index 0000000000000000000000000000000000000000..961fa6e98ff189786e3545748729c27e2fb9be05
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/train_ppo.sh
@@ -0,0 +1,90 @@
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export DATA_DIR='data/nq_search'
+
+WAND_PROJECT='Search-R1'
+
+export BASE_MODEL='meta-llama/Llama-3.2-3B'
+export EXPERIMENT_NAME=nq-search-r1-ppo-llama3.2-3b-em
+# export BASE_MODEL='meta-llama/Llama-3.2-3B-Instruct'
+# export EXPERIMENT_NAME=nq-search-r1-ppo-llama3.2-3b-it-em
+# export BASE_MODEL='meta-llama/Llama-3.1-8B'
+# export EXPERIMENT_NAME=nq-search-r1-ppo-llama3.1-8b-em
+# export BASE_MODEL='meta-llama/Llama-3.1-8B-Instruct'
+# export EXPERIMENT_NAME=nq-search-r1-ppo-llama3.1-8b-it-em
+
+# export BASE_MODEL='Qwen/Qwen2.5-3B'
+# export EXPERIMENT_NAME=nq-search-r1-ppo-qwen2.5-3b-em
+# export BASE_MODEL='Qwen/Qwen2.5-3B-Instruct'
+# export EXPERIMENT_NAME=nq-search-r1-ppo-qwen2.5-3b-it-em
+# export BASE_MODEL='Qwen/Qwen2.5-7B'
+# export EXPERIMENT_NAME=nq-search-r1-ppo-qwen2.5-7b-em
+# export BASE_MODEL='Qwen/Qwen2.5-7B-Instruct'
+# export EXPERIMENT_NAME=nq-search-r1-ppo-qwen2.5-7b-it-em
+
+# set -x
+export VLLM_ATTENTION_BACKEND=XFORMERS # vllm + qwen2-7b with flash_attn has some issues
+
+# max_prompt_length = (config['training']['max_start_length'] + config['training']['max_response_length'] * (config['training']['max_turns'] - 1) + config['training']['max_obs_length'] * config['training']['max_turns'])
+
+PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
+ data.train_files=$DATA_DIR/train.parquet \
+ data.val_files=$DATA_DIR/test.parquet \
+ data.train_data_num=null \
+ data.val_data_num=null \
+ data.train_batch_size=512 \
+ data.val_batch_size=256 \
+ data.max_prompt_length=4096 \
+ data.max_response_length=500 \
+ data.max_start_length=2048 \
+ data.max_obs_length=500 \
+ data.shuffle_train_dataloader=True \
+ algorithm.adv_estimator=gae \
+ actor_rollout_ref.model.path=$BASE_MODEL \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=true \
+ actor_rollout_ref.actor.fsdp_config.grad_offload=true \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=true \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ actor_rollout_ref.rollout.n_agent=1 \
+ actor_rollout_ref.rollout.temperature=1 \
+ actor_rollout_ref.actor.state_masking=true \
+ critic.optim.lr=1e-5 \
+ critic.model.use_remove_padding=True \
+ critic.optim.lr_warmup_steps_ratio=0.015 \
+ critic.model.path=$BASE_MODEL \
+ critic.model.enable_gradient_checkpointing=true \
+ critic.ppo_micro_batch_size=8 \
+ critic.model.fsdp_config.param_offload=true \
+ critic.model.fsdp_config.grad_offload=true \
+ critic.model.fsdp_config.optimizer_offload=true \
+ algorithm.kl_ctrl.kl_coef=0.001 \
+ algorithm.no_think_rl=false \
+ trainer.critic_warmup=0 \
+ trainer.logger=['wandb'] \
+ +trainer.val_only=false \
+ +trainer.val_before_train=true \
+ trainer.default_hdfs_dir=null \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=100 \
+ trainer.test_freq=50 \
+ trainer.project_name=$WAND_PROJECT \
+ trainer.experiment_name=$EXPERIMENT_NAME \
+ trainer.total_epochs=15 \
+ trainer.total_training_steps=1005 \
+ trainer.default_hdfs_dir=null \
+ trainer.default_local_dir=verl_checkpoints/$EXPERIMENT_NAME \
+ max_turns=2 \
+ retriever.url="http://127.0.0.1:8000/retrieve" \
+ retriever.topk=3 \
+ 2>&1 | tee $EXPERIMENT_NAME.log
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/__init__.py b/code/RL_model/verl/Search-R1/verl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f068717761543cde8dd59ad08b42465160893bb3
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+version_folder = os.path.dirname(os.path.abspath(__file__))
+
+with open(os.path.join(version_folder, 'version/version')) as f:
+ __version__ = f.read().strip()
+
+from .protocol import DataProto
+
+from .utils.logging_utils import set_basic_config
+import logging
+
+set_basic_config(level=logging.WARNING)
diff --git a/code/RL_model/verl/Search-R1/verl/models/README.md b/code/RL_model/verl/Search-R1/verl/models/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..677b92f3871aa2f76a7f5bd8c07d1050bab14564
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/README.md
@@ -0,0 +1,35 @@
+# Models
+Common model zoos such as huggingface/transformers struggle with PyTorch-native model parallelism. Following the design principle of vLLM, verl keeps a simple, parallelizable, highly optimized model implementation that works on packed inputs.
+## Adding a New Huggingface Model
+### Step 1: Copy the model file from HF to verl
+- Add a new file under verl/models/hf
+- Copy ONLY the model file from huggingface/transformers/models to verl/models/hf
+
+### Step 2: Modify the model file to use packed inputs
+- Remove all the code related to inference (kv cache)
+- Modify the inputs to include only
+ - input_ids (total_nnz,)
+  - cu_seqlens (batch_size + 1,)
+ - max_seqlen_in_batch: int
+- Note that this requires using flash attention with causal mask.
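+
+A minimal sketch of the packed-input format (illustrative values only, not verl code):
+
+```python
+import torch
+
+# Three sequences of lengths 3, 5 and 2, packed into one flat tensor.
+seqs = [torch.tensor([1, 2, 3]), torch.tensor([4, 5, 6, 7, 8]), torch.tensor([9, 10])]
+input_ids = torch.cat(seqs)                     # shape (total_nnz,) == (10,)
+seqlens = torch.tensor([len(s) for s in seqs])
+cu_seqlens = torch.zeros(len(seqs) + 1, dtype=torch.int32)
+cu_seqlens[1:] = torch.cumsum(seqlens, dim=0)   # tensor([0, 3, 8, 10])
+max_seqlen_in_batch = int(seqlens.max())        # 5
+```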
+
+### Step 2.5: Add tests
+- Add a test to compare this version and the huggingface version
+- Following the infrastructure and add tests to tests/models/hf
+
+### Step 3: Add a function to apply tensor parallelism
+- Please follow
+ - https://pytorch.org/docs/stable/distributed.tensor.parallel.html
+ - https://pytorch.org/tutorials/intermediate/TP_tutorial.html
+- General comments
+ - Tensor Parallelism in native Pytorch is NOT auto-parallelism. The way it works is to specify how model parameters and input/output reshards using configs. These configs are then registered as hooks to perform input/output resharding before/after model forward.
+
+### Step 4: Add a function to apply data parallelism
+- Please use FSDP2 APIs
+- See demo here https://github.com/pytorch/torchtitan/blob/main/torchtitan/parallelisms/parallelize_llama.py#L413
+
+### Step 5: Add a function to apply pipeline parallelism
+- Comes in Pytorch 2.4
+- Currently only in alpha in nightly version
+- Check torchtitan for more details
+
diff --git a/code/RL_model/verl/Search-R1/verl/models/__init__.py b/code/RL_model/verl/Search-R1/verl/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/models/llama/__init__.py b/code/RL_model/verl/Search-R1/verl/models/llama/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/llama/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/models/llama/megatron/__init__.py b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b188b3ee62cdfb978fc482984b423ce12e40a962
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/__init__.py
@@ -0,0 +1,24 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .modeling_llama_megatron import (
+ # original model with megatron
+ ParallelLlamaModel,
+ ParallelLlamaForCausalLM,
+ # rmpad with megatron
+ ParallelLlamaForCausalLMRmPad,
+ ParallelLlamaForValueRmPad,
+ # rmpad with megatron and pipeline parallelism
+ ParallelLlamaForCausalLMRmPadPP,
+ ParallelLlamaForValueRmPadPP)
diff --git a/code/RL_model/verl/Search-R1/verl/models/llama/megatron/checkpoint_utils/__init__.py b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/checkpoint_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/checkpoint_utils/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/models/llama/megatron/checkpoint_utils/llama_loader.py b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/checkpoint_utils/llama_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..00fb0a9c668be28b4e13abb9a24e42bd7498d088
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/checkpoint_utils/llama_loader.py
@@ -0,0 +1,446 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import time
+from typing import Dict, Any, Callable, Optional
+import torch.distributed as dist
+
+
+def _megatron_calc_layer_map(config):
+ """Calculate the mapping of global layer_idx to local layer_idx
+ Returns:
+ layer_map (Dict: int -> tuple(int, int, int)):
+ mapping from the global layer index to
+ a tuple of (pp_rank, virtual_pp_rank, layer_idx inside model)
+ """
+ import megatron
+ from megatron.core import mpu
+
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
+
+ layer_map = dict()
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
+
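+    # The global layer index space is split into virtual_pp_size contiguous chunks;
+    # within each chunk, pp rank p owns the p-th block of num_layers_per_model layers.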
+ for pp_rank_idx in range(pp_size):
+ for virtual_pp_rank_idx in range(virtual_pp_size):
+ layer_offset = (virtual_pp_rank_idx * (config.num_hidden_layers // virtual_pp_size) +
+ pp_rank_idx * num_layers_per_model)
+ for layer_idx in range(num_layers_per_model):
+ layer_map[layer_offset + layer_idx] = (
+ pp_rank_idx,
+ virtual_pp_rank_idx,
+ layer_idx,
+ )
+ return layer_map
+
+
+def load_state_dict_to_megatron_llama(state_dict, wrapped_models, config, params_dtype, is_value_model=False):
+ """Load merged state_dict to sharded Megatron module in training.
+ """
+ import megatron
+ from megatron.core import mpu
+ from megatron.utils import print_rank_0, unwrap_model
+ from megatron.core.transformer.module import Float16Module
+ from megatron.core import DistributedDataParallel as LocalDDP
+ from torch.nn.parallel import DistributedDataParallel as torchDDP
+
+ start_time = time.time()
+
+ def _get_gpt_model(model):
+ return model
+
+ def broadcast_params(module):
+ for param in module.parameters():
+ torch.distributed.broadcast(param.data,
+ src=mpu.get_data_parallel_src_rank(),
+ group=mpu.get_data_parallel_group())
+
+ dp_rank = mpu.get_data_parallel_rank()
+ pp_rank = mpu.get_pipeline_model_parallel_rank()
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
+ mp_group = mpu.get_model_parallel_group()
+
+ if torch.distributed.get_rank() == 0:
+        assert mp_group.rank() == 0, f"mp_rank:[{mp_group.rank()}] != 0 on rank #0"
+ assert pp_rank == 0, f"pp_rank:[{pp_rank}] != 0 on rank #0"
+ assert dp_rank == 0, f"dp_rank:[{dp_rank}] != 0 on rank #0"
+
+    if not isinstance(wrapped_models, (list, tuple)):
+        wrapped_models = [wrapped_models]
+
+ assert len(wrapped_models) == virtual_pp_size
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
+
+ models = [None] * len(wrapped_models)
+
+ for i, wrapped_model in enumerate(wrapped_models):
+ models[i] = unwrap_model(wrapped_model, (torchDDP, LocalDDP, Float16Module))
+ gpt_model_module = _get_gpt_model(models[i])
+ assert len(gpt_model_module.model.layers) == num_layers_per_model
+
+ def _broadcast_tensor(tensor, name) -> torch.Tensor:
+ """broadcast tensor from rank0 across mp_group"""
+ nonlocal state_dict
+ nonlocal mp_group
+ if torch.distributed.get_rank() == 0:
+ if name in state_dict:
+ weight = state_dict[name]
+ tensor_shape = weight.shape
+ else:
+ tensor_shape = None
+ else:
+ weight = None
+ tensor_shape = None
+
+ obj_list = [tensor_shape]
+ dist.broadcast_object_list(obj_list, src=0, group=mp_group)
+ tensor_shape = obj_list[0]
+
+ if tensor_shape is None:
+ # all or none ranks in the mp_group should reach here
+ print_rank_0(f"tensor:[{name}] not in state_dict, skip load")
+ return
+
+ if tensor is None:
+ tensor = torch.empty(
+ tensor_shape,
+ dtype=params_dtype,
+ device=torch.cuda.current_device(),
+ requires_grad=False,
+ )
+ if torch.distributed.get_rank() == 0:
+ tensor.data.copy_(weight)
+ dist.broadcast(tensor, src=0, group=mp_group)
+
+ def _broadcast_tp_shard_tensor_vocab(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
+ """broadcast tensor in tp shards across mp_group"""
+ nonlocal state_dict
+ nonlocal mp_group
+ tp_rank = mpu.get_tensor_model_parallel_rank()
+ tp_size = mpu.get_tensor_model_parallel_world_size()
+
+ if torch.distributed.get_rank() == 0:
+ if name in state_dict:
+ full_weight = state_dict[name]
+
+ if mutate_func is not None:
+ full_weight = mutate_func(full_weight)
+ tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
+ chunk_shape = tensor_chunk[0].shape
+ else:
+ chunk_shape = None
+ else:
+ chunk_shape = None
+
+ obj_list = [chunk_shape]
+ dist.broadcast_object_list(obj_list, src=0, group=mp_group)
+ chunk_shape = obj_list[0]
+ if chunk_shape is None:
+ # all or none ranks in the mp_group should reach here
+ print_rank_0(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
+ return
+
+ if tensor is None:
+ sync_tensor = torch.empty(
+ chunk_shape,
+ dtype=params_dtype,
+ device=torch.cuda.current_device(),
+ requires_grad=False,
+ )
+ else:
+ assert (tensor.shape == chunk_shape
+ ), f"rank #{torch.distributed.get_rank()} tensor {name} shape {tensor.shape} != {chunk_shape}"
+ sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
+
+ for i in range(tp_size):
+ if torch.distributed.get_rank() == 0:
+ sync_tensor.data.copy_(tensor_chunk[i])
+ dist.broadcast(sync_tensor, src=0, group=mp_group)
+ if (i == tp_rank) and (tensor is not None):
+ tensor.data.copy_(sync_tensor)
+
+ def _broadcast_tp_shard_tensor(tensor, name, chunk_dim=0, mutate_func=None) -> torch.Tensor:
+ """broadcast tensor in tp shards across mp_group"""
+ nonlocal state_dict
+ nonlocal mp_group
+ tp_rank = mpu.get_tensor_model_parallel_rank()
+ tp_size = mpu.get_tensor_model_parallel_world_size()
+
+ if torch.distributed.get_rank() == 0:
+ if name in state_dict:
+ full_weight = state_dict[name]
+ if mutate_func is not None:
+ full_weight = mutate_func(full_weight)
+ tensor_chunk = torch.chunk(full_weight, tp_size, dim=chunk_dim)
+ chunk_shape = tensor_chunk[0].shape
+ else:
+ chunk_shape = None
+ else:
+ chunk_shape = None
+
+ obj_list = [chunk_shape]
+ dist.broadcast_object_list(obj_list, src=0, group=mp_group)
+ chunk_shape = obj_list[0]
+ if chunk_shape is None:
+ # all or none ranks in the mp_group should reach here
+ print_rank_0(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
+ return
+
+ if tensor is None:
+ sync_tensor = torch.empty(
+ chunk_shape,
+ dtype=params_dtype,
+ device=torch.cuda.current_device(),
+ requires_grad=False,
+ )
+ else:
+ assert (tensor.shape == chunk_shape
+ ), f"rank #{torch.distributed.get_rank()} tensor {name} shape {tensor.shape} != {chunk_shape}"
+ sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
+
+ for i in range(tp_size):
+ if torch.distributed.get_rank() == 0:
+ sync_tensor.data.copy_(tensor_chunk[i])
+ dist.broadcast(sync_tensor, src=0, group=mp_group)
+ if (i == tp_rank) and (tensor is not None):
+ tensor.data.copy_(sync_tensor)
+
+ def _broadcast_tp_shard_tensor_gate_up(tensor, gate_name, up_name) -> torch.Tensor:
+ """broadcast tensor in tp shards across mp_group"""
+ nonlocal state_dict
+ nonlocal mp_group
+ tp_rank = mpu.get_tensor_model_parallel_rank()
+ tp_size = mpu.get_tensor_model_parallel_world_size()
+
+ if torch.distributed.get_rank() == 0:
+ gate_weight = state_dict[gate_name]
+ up_weight = state_dict[up_name]
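+            # Build Megatron's fused gate_up layout: TP rank i owns the row block
+            # [2*I*i, 2*I*(i+1)) laid out as [gate_i; up_i], where I = intermediate_size // tp_size.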
+ new_gate_up_weight = torch.empty(config.intermediate_size * 2,
+ config.hidden_size,
+ dtype=params_dtype,
+ device=torch.cuda.current_device())
+ for i in range(tp_size):
+ intermediate_size_tp = config.intermediate_size // tp_size
+ gate_weight_tp = gate_weight[i * intermediate_size_tp:(i + 1) * intermediate_size_tp]
+ up_weight_tp = up_weight[i * intermediate_size_tp:(i + 1) * intermediate_size_tp]
+ new_gate_up_weight[intermediate_size_tp * 2 * i:intermediate_size_tp * 2 * (i + 1)].copy_(
+ torch.cat([gate_weight_tp, up_weight_tp], dim=0))
+
+ tensor_chunk = torch.chunk(new_gate_up_weight, tp_size, dim=0)
+ chunk_shape = tensor_chunk[0].shape
+ else:
+ chunk_shape = None
+
+ obj_list = [chunk_shape]
+ dist.broadcast_object_list(obj_list, src=0, group=mp_group)
+ chunk_shape = obj_list[0]
+ if chunk_shape is None:
+ # all or none ranks in the mp_group should reach here
+ print_rank_0(f"tp_shard tensor:[{gate_name, up_name}] not in state_dict, skip loading")
+ return
+
+ if tensor is None:
+ sync_tensor = torch.empty(
+ chunk_shape,
+ dtype=params_dtype,
+ device=torch.cuda.current_device(),
+ requires_grad=False,
+ )
+ else:
+ assert (
+ tensor.shape == chunk_shape
+ ), f"rank #{torch.distributed.get_rank() == 0:} tensor {gate_name, up_name} shape {tensor.shape} != {chunk_shape}"
+ sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
+
+ for i in range(tp_size):
+ if torch.distributed.get_rank() == 0:
+ sync_tensor.data.copy_(tensor_chunk[i])
+ dist.broadcast(sync_tensor, src=0, group=mp_group)
+ if (i == tp_rank) and (tensor is not None):
+ tensor.data.copy_(sync_tensor)
+
+ def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name) -> torch.Tensor:
+ """broadcast tensor in tp shards across mp_group"""
+ nonlocal state_dict
+ nonlocal mp_group
+ tp_rank = mpu.get_tensor_model_parallel_rank()
+ tp_size = mpu.get_tensor_model_parallel_world_size()
+
+ if torch.distributed.get_rank() == 0:
+ assert (q_name in state_dict and k_name in state_dict and v_name in state_dict)
+ full_weight_q = state_dict[q_name]
+ full_weight_k = state_dict[k_name]
+ full_weight_v = state_dict[v_name]
+
+ hidden_size_per_head = config.hidden_size // config.num_attention_heads
+
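+            # Build Megatron's fused QKV layout: TP rank i owns one contiguous
+            # slice [q_i; k_i; v_i] of q_size_tp + 2 * kv_size_tp rows along dim 0.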
+ if config.num_key_value_heads >= tp_size:
+ q_size_tp = config.hidden_size // tp_size
+ kv_size_tp = hidden_size_per_head * config.num_key_value_heads // tp_size
+ total_size = q_size_tp + 2 * kv_size_tp
+ new_weight_qkv = torch.empty(total_size * tp_size,
+ config.hidden_size,
+ dtype=params_dtype,
+ device=torch.cuda.current_device())
+ for i in range(tp_size):
+ q_part = full_weight_q[i * q_size_tp:(i + 1) * q_size_tp]
+ k_part = full_weight_k[i * kv_size_tp:(i + 1) * kv_size_tp]
+ v_part = full_weight_v[i * kv_size_tp:(i + 1) * kv_size_tp]
+ new_weight_qkv[i * total_size:(i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part],
+ dim=0))
+
+ else:
+ q_size_tp = config.hidden_size // tp_size
+ kv_size_tp = hidden_size_per_head
+ total_size = q_size_tp + 2 * kv_size_tp
+ new_weight_qkv = torch.empty(total_size * tp_size,
+ config.hidden_size,
+ dtype=params_dtype,
+ device=torch.cuda.current_device())
+ for i in range(tp_size):
+ q_part = full_weight_q[i * q_size_tp:(i + 1) * q_size_tp]
+ start_idx = i * config.num_key_value_heads // tp_size * hidden_size_per_head
+ end_idx = (i * config.num_key_value_heads // tp_size + 1) * hidden_size_per_head
+ k_part = full_weight_k[start_idx:end_idx]
+ v_part = full_weight_v[start_idx:end_idx]
+ new_weight_qkv[i * total_size:(i + 1) * total_size].copy_(torch.cat([q_part, k_part, v_part],
+ dim=0))
+
+ tensor_chunk = torch.chunk(new_weight_qkv, tp_size, dim=0)
+ chunk_shape = tensor_chunk[0].shape
+ else:
+ chunk_shape = None
+
+ obj_list = [chunk_shape]
+ dist.broadcast_object_list(obj_list, src=0, group=mp_group)
+ chunk_shape = obj_list[0]
+ if chunk_shape is None:
+ # all or none ranks in the mp_group should reach here
+ print_rank_0(f"tp_shard tensor:[{name}] not in state_dict, skip loading")
+ return
+
+ if tensor is None:
+ sync_tensor = torch.empty(
+ chunk_shape,
+ dtype=params_dtype,
+ device=torch.cuda.current_device(),
+ requires_grad=False,
+ )
+ else:
+ assert (tensor.shape == chunk_shape
+ ), f"rank #{torch.distributed.get_rank()} tensor {q_name} shape {tensor.shape} != {chunk_shape}"
+ sync_tensor = torch.empty_like(tensor, device=torch.cuda.current_device(), requires_grad=False)
+
+ for i in range(tp_size):
+ if torch.distributed.get_rank() == 0:
+ sync_tensor.data.copy_(tensor_chunk[i])
+ dist.broadcast(sync_tensor, src=0, group=mp_group)
+ if (i == tp_rank) and (tensor is not None):
+ tensor.data.copy_(sync_tensor)
+
+ if dp_rank == 0:
+ # Embeddings
+ # -------------------
+ print_rank_0("loading embeddings...")
+ gpt_model_module = _get_gpt_model(models[0])
+ embed_tokens_weight = None
+ if pp_rank == 0:
+ embed_tokens_weight = gpt_model_module.model.embed_tokens.weight
+ _broadcast_tp_shard_tensor_vocab(embed_tokens_weight, "model.embed_tokens.weight")
+
+ # Transformer layers
+ # -------------------
+ layer_map = _megatron_calc_layer_map(config)
+
+ for layer in range(config.num_hidden_layers):
+ print_rank_0(f"loading layer #{layer}...")
+ layer_name = f"model.layers.{layer}"
+ dst_pp_rank, dst_virtual_pp_rank, dst_layer_idx = layer_map[layer]
+
+ gpt_model_module = _get_gpt_model(models[dst_virtual_pp_rank])
+ sync_layer = gpt_model_module.model.layers[dst_layer_idx]
+
+ _broadcast_tensor(
+ sync_layer.input_layernorm.weight if dst_pp_rank == pp_rank else None,
+ f"{layer_name}.input_layernorm.weight",
+ )
+
+ _broadcast_tp_shard_tensor_qkv(
+ sync_layer.self_attn.qkv_proj.weight if dst_pp_rank == pp_rank else None,
+ f"{layer_name}.self_attn.q_proj.weight",
+ f"{layer_name}.self_attn.k_proj.weight",
+ f"{layer_name}.self_attn.v_proj.weight",
+ )
+
+ _broadcast_tp_shard_tensor(
+ sync_layer.self_attn.o_proj.weight if dst_pp_rank == pp_rank else None,
+ f"{layer_name}.self_attn.o_proj.weight",
+ chunk_dim=1,
+ )
+
+ _broadcast_tensor(
+ sync_layer.post_attention_layernorm.weight if dst_pp_rank == pp_rank else None,
+ f"{layer_name}.post_attention_layernorm.weight",
+ )
+
+ _broadcast_tp_shard_tensor_gate_up(sync_layer.mlp.gate_up_proj.weight if dst_pp_rank == pp_rank else None,
+ f"{layer_name}.mlp.gate_proj.weight", f"{layer_name}.mlp.up_proj.weight")
+
+ _broadcast_tp_shard_tensor(
+ sync_layer.mlp.down_proj.weight if dst_pp_rank == pp_rank else None,
+ f"{layer_name}.mlp.down_proj.weight",
+ chunk_dim=1,
+ )
+ # Final Layernorm
+ # -------------------
+ print_rank_0("loading final layernorm...")
+ gpt_model_module = _get_gpt_model(models[-1])
+ _broadcast_tensor(
+ getattr(gpt_model_module.model.norm, "weight", None),
+ "model.norm.weight",
+ )
+
+ print_rank_0("loading lm_head...")
+ lm_head_weight = None
+ if pp_rank + 1 == pp_size:
+ lm_head_weight = gpt_model_module.lm_head.weight
+
+ if is_value_model:
+ # if torch.distributed.get_rank() == 0:
+ if 'lm_head.weight' in state_dict and state_dict['lm_head.weight'].shape[0] == 1:
+ _broadcast_tensor(lm_head_weight, "lm_head.weight")
+ elif 'reward_head.weight' in state_dict and state_dict['reward_head.weight'].shape[0] == 1:
+ _broadcast_tensor(lm_head_weight, "reward_head.weight")
+ print_rank_0('load lm_head from value_head weight')
+ else:
+ _broadcast_tensor(None, "lm_head.weight")
+ print_rank_0('fail to match lm_head in value_model')
+ # else:
+
+ # _broadcast_tensor(lm_head_weight, "lm_head.weight")
+
+ else:
+ _broadcast_tp_shard_tensor(lm_head_weight, "lm_head.weight")
+ dist.barrier()
+ # Broadcast weights inside data parallel groups
+ for wrapped_model in wrapped_models:
+ broadcast_params(wrapped_model)
+
+ torch.cuda.empty_cache()
+ print_rank_0(f"loading megatron ckpt done, time elapsed {time.time() - start_time}s")
diff --git a/code/RL_model/verl/Search-R1/verl/models/llama/megatron/checkpoint_utils/llama_saver.py b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/checkpoint_utils/llama_saver.py
new file mode 100644
index 0000000000000000000000000000000000000000..0764b6fe5020dc8ab3f69d57af9910e267aab52d
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/checkpoint_utils/llama_saver.py
@@ -0,0 +1,449 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import megatron
+from megatron.core import mpu
+from megatron.utils import print_rank_0, unwrap_model
+from megatron.model import Float16Module
+from megatron.model import DistributedDataParallel as LocalDDP
+from torch.nn.parallel import DistributedDataParallel as torchDDP
+import torch
+import time
+from typing import Optional
+import torch.distributed as dist
+from megatron import get_args
+
+
+def _megatron_calc_global_rank(tp_rank: int = 0, dp_rank: int = 0, pp_rank: int = 0):
+ """given TP,DP,PP rank to get the global rank."""
+
+ args = get_args()
+ tp_size = mpu.get_tensor_model_parallel_world_size()
+ dp_size = mpu.get_data_parallel_world_size()
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
+ assert (tp_size * dp_size * pp_size == torch.distributed.get_world_size()
+ ), f"{tp_size} x {dp_size} x {pp_size} != {torch.distributed.get_world_size()}"
+ if args.switch_dp_and_pp_grouping:
+ # TP-PP-DP grouping
+ return (dp_rank * pp_size + pp_rank) * tp_size + tp_rank
+ else:
+ # TP-DP-PP grouping
+ return (pp_rank * dp_size + dp_rank) * tp_size + tp_rank
+
+
+def _megatron_calc_layer_map(config):
+ """Calculate the mapping of global layer_idx to local layer_idx
+ Returns:
+ layer_map (Dict: int -> tuple(int, int, int)):
+ mapping from the global layer index to
+ a tuple of (pp_rank, virtual_pp_rank, layer_idx inside model)
+ """
+ import megatron
+ from megatron.core import mpu
+
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
+
+ args = megatron.get_args()
+ layer_map = dict()
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
+
+ for pp_rank_idx in range(pp_size):
+ for virtual_pp_rank_idx in range(virtual_pp_size):
+ layer_offset = (virtual_pp_rank_idx * (config.num_hidden_layers // virtual_pp_size) +
+ pp_rank_idx * num_layers_per_model)
+ for layer_idx in range(num_layers_per_model):
+ layer_map[layer_offset + layer_idx] = (
+ pp_rank_idx,
+ virtual_pp_rank_idx,
+ layer_idx,
+ )
+ return layer_map
+
+
+def merge_megatron_ckpt_llama(wrapped_models, config, is_value_model=False, dtype='bf16'):
+ """Merge sharded parameters of a Megatron module into a merged checkpoint.
+
+ Args:
+        wrapped_models (list of megatron.model.DistributedDataParallel):
+            The local DDP-wrapped megatron modules.
+        is_value_model (bool):
+            Whether the model is a value model whose head outputs a single scalar.
+        dtype (str or None):
+            The data type of the merged state_dict. If None, the data type of the original
+            parameters is used.
+ Returns:
+ state_dict (dict):
+ The merged state_dict in rank 0, and an empty dictionary in other ranks.
+ """
+ start_time = time.time()
+ args = megatron.get_args()
+
+ def _get_gpt_model(model):
+ return model
+
+ dp_rank = mpu.get_data_parallel_rank()
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
+ pp_rank = mpu.get_pipeline_model_parallel_rank()
+ virtual_pp_size = mpu.get_virtual_pipeline_model_parallel_world_size() or 1
+ mp_group = mpu.get_model_parallel_group()
+
+ if dist.get_rank() == 0:
+        assert mp_group.rank() == 0, f"mp_rank:[{mp_group.rank()}] != 0 on rank #0"
+ assert pp_rank == 0, f"pp_rank:[{pp_rank}] != 0 on rank #0"
+ assert dp_rank == 0, f"dp_rank:[{dp_rank}] != 0 on rank #0"
+
+    if not isinstance(wrapped_models, (list, tuple)):
+        wrapped_models = [wrapped_models]
+
+ assert len(wrapped_models) == virtual_pp_size
+ num_layers_per_model = config.num_hidden_layers // pp_size // virtual_pp_size
+ assert num_layers_per_model * pp_size * virtual_pp_size == config.num_hidden_layers
+
+ models = [None] * len(wrapped_models)
+
+ for i, wrapped_model in enumerate(wrapped_models):
+ models[i] = unwrap_model(wrapped_model, (torchDDP, LocalDDP, Float16Module))
+ assert len(models[i].model.layers
+ ) == num_layers_per_model, 'len model layers {} not equal to num_layers_per_model {}'.format(
+ len(models[i].model.layers), num_layers_per_model)
+
+ state_dict = dict()
+
+ def _get_cpu_tensor(tensor: torch.Tensor):
+ if tensor is None:
+ return None
+ if tensor.device == torch.device("cpu"):
+ return tensor.detach().clone()
+ return tensor.detach().cpu()
+
+ def _broadcast_tensor(tensor, name, src_pp_rank) -> torch.Tensor:
+ """broadcast tensor across mp_group"""
+ nonlocal state_dict
+ nonlocal mp_group
+ src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
+
+ if torch.distributed.get_rank() == src_rank:
+ if tensor is None:
+ weight = None
+ tensor_shape = None
+ else:
+ weight = tensor
+ tensor_shape = weight.shape
+ else:
+ weight = None
+ tensor_shape = None
+
+ obj_list = [tensor_shape]
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
+ tensor_shape = obj_list[0]
+
+ if tensor_shape is None:
+ # all or none ranks in the mp_group should reach here
+ print_rank_0(f"tensor:[{name}] not exist, skip collect")
+ return
+
+ if weight is None:
+ weight = torch.empty(
+ tensor_shape,
+ dtype=args.params_dtype,
+ device=torch.cuda.current_device(),
+ requires_grad=False,
+ )
+
+ dist.broadcast(weight, src=src_rank, group=mp_group)
+
+ if torch.distributed.get_rank() == 0:
+ state_dict[name] = _get_cpu_tensor(weight)
+
+ def _broadcast_tp_shard_tensor(tensor, name, src_pp_rank, concat_dim=0, mutate_func=None) -> torch.Tensor:
+ """broadcast tensor in tp shards across mp_group"""
+ nonlocal state_dict
+ nonlocal mp_group
+ tp_rank = mpu.get_tensor_model_parallel_rank()
+ tp_size = mpu.get_tensor_model_parallel_world_size()
+ src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
+
+ if torch.distributed.get_rank() == src_rank:
+ chunk_shape = tensor.shape
+ else:
+ chunk_shape = None
+
+ obj_list = [chunk_shape]
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
+ chunk_shape = obj_list[0]
+ if chunk_shape is None:
+ # all or none ranks in the mp_group should reach here
+ print_rank_0(f"tp_shard tensor:[{name}] not exist, skip collecting")
+ return
+
+ buffer_tensor = torch.empty(
+ chunk_shape,
+ dtype=args.params_dtype,
+ device=torch.cuda.current_device(),
+ requires_grad=False,
+ )
+
+ chunk_tensors = [None] * tp_size
+
+ for i in range(tp_size):
+ cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank)
+ sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
+ dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
+
+ if torch.distributed.get_rank() == 0:
+ chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
+
+ if torch.distributed.get_rank() == 0:
+ full_tensor = torch.concat(chunk_tensors, dim=concat_dim)
+ if mutate_func is not None:
+ full_tensor = mutate_func(full_tensor)
+ state_dict[name] = full_tensor
+
+ def _broadcast_tp_shard_tensor_gate_up(tensor, gate_name, up_name, src_pp_rank) -> torch.Tensor:
+ """broadcast tensor in tp shards across mp_group"""
+ nonlocal state_dict
+ nonlocal mp_group
+ tp_rank = mpu.get_tensor_model_parallel_rank()
+ tp_size = mpu.get_tensor_model_parallel_world_size()
+ src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
+
+ if torch.distributed.get_rank() == src_rank:
+ chunk_shape = tensor.shape
+ else:
+ chunk_shape = None
+
+ obj_list = [chunk_shape]
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
+ chunk_shape = obj_list[0]
+ if chunk_shape is None:
+ # all or none ranks in the mp_group should reach here
+ print_rank_0(f"tp_shard tensor:[{gate_name, up_name}] not exist, skip collecting")
+ return
+
+ buffer_tensor = torch.empty(
+ chunk_shape,
+ dtype=args.params_dtype,
+ device=torch.cuda.current_device(),
+ requires_grad=False,
+ )
+
+ chunk_tensors = [None] * tp_size
+
+ for i in range(tp_size):
+ cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank)
+ sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
+ dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
+
+ if torch.distributed.get_rank() == 0:
+ chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
+
+ if torch.distributed.get_rank() == 0:
+ full_tensor = torch.concat(chunk_tensors, dim=0)
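+            # Undo the fused layout: each TP rank's block is [gate_i; up_i], so split
+            # the blocks apart and re-concatenate into the full gate and up weights.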
+ intermediate_size_tp = config.intermediate_size // tp_size
+ gate_weight_list = []
+ up_weight_list = []
+ for i in range(tp_size):
+ gate_up_weight_tp = full_tensor[intermediate_size_tp * 2 * i:intermediate_size_tp * 2 * (i + 1)]
+ gate_weight_tp = gate_up_weight_tp[:intermediate_size_tp]
+ up_weight_tp = gate_up_weight_tp[intermediate_size_tp:]
+ gate_weight_list.append(gate_weight_tp)
+ up_weight_list.append(up_weight_tp)
+
+ state_dict[gate_name] = torch.cat(gate_weight_list, dim=0)
+ state_dict[up_name] = torch.cat(up_weight_list, dim=0)
+
+ def _broadcast_tp_shard_tensor_qkv(tensor, q_name, k_name, v_name, src_pp_rank):
+ """broadcast tensor in tp shards across mp_group"""
+ nonlocal state_dict
+ nonlocal mp_group
+ tp_rank = mpu.get_tensor_model_parallel_rank()
+ tp_size = mpu.get_tensor_model_parallel_world_size()
+ src_rank = _megatron_calc_global_rank(tp_rank=0, dp_rank=0, pp_rank=src_pp_rank)
+
+ if torch.distributed.get_rank() == src_rank:
+ chunk_shape = tensor.shape
+ else:
+ chunk_shape = None
+
+ obj_list = [chunk_shape]
+ dist.broadcast_object_list(obj_list, src=src_rank, group=mp_group)
+ chunk_shape = obj_list[0]
+ if chunk_shape is None:
+ # all or none ranks in the mp_group should reach here
+ print_rank_0(f"tp_shard tensor:[{q_name}] not exist, skip collecting")
+ return
+
+ buffer_tensor = torch.empty(
+ chunk_shape,
+ dtype=args.params_dtype,
+ device=torch.cuda.current_device(),
+ requires_grad=False,
+ )
+
+ chunk_tensors = [None] * tp_size
+
+ for i in range(tp_size):
+ cur_src_rank = _megatron_calc_global_rank(tp_rank=i, dp_rank=0, pp_rank=src_pp_rank)
+ sync_tensor = tensor if torch.distributed.get_rank() == cur_src_rank else buffer_tensor
+ dist.broadcast(sync_tensor, src=cur_src_rank, group=mp_group)
+
+ if torch.distributed.get_rank() == 0:
+ chunk_tensors[i] = _get_cpu_tensor(sync_tensor)
+
+ if torch.distributed.get_rank() == 0:
+ full_tensor = torch.concat(chunk_tensors, dim=0)
+ q_weight_list = []
+ k_weight_list = []
+ v_weight_list = []
+ hidden_size_per_head = config.hidden_size // config.num_attention_heads
+
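+            # Undo the fused QKV layout: each TP rank's block is [q_i; k_i; v_i]; split the
+            # blocks and re-concatenate (k/v are deduplicated when GQA shares heads across ranks).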
+ if config.num_key_value_heads >= tp_size:
+ q_size_tp = config.hidden_size // tp_size
+ kv_size_tp = hidden_size_per_head * config.num_key_value_heads // tp_size
+ total_size = q_size_tp + 2 * kv_size_tp
+ for i in range(tp_size):
+ qkv_part = full_tensor[i * total_size:(i + 1) * total_size]
+ q_part = qkv_part[:q_size_tp]
+ k_part = qkv_part[q_size_tp:q_size_tp + kv_size_tp]
+ v_part = qkv_part[q_size_tp + kv_size_tp:total_size]
+ q_weight_list.append(q_part)
+ k_weight_list.append(k_part)
+ v_weight_list.append(v_part)
+ else:
+ q_size_tp = config.hidden_size // tp_size
+ kv_size_tp = hidden_size_per_head
+ total_size = q_size_tp + 2 * kv_size_tp
+ for i in range(tp_size):
+ qkv_part = full_tensor[i * total_size:(i + 1) * total_size]
+ q_part = qkv_part[:q_size_tp]
+ k_part = qkv_part[q_size_tp:q_size_tp + kv_size_tp]
+ v_part = qkv_part[q_size_tp + kv_size_tp:total_size]
+ q_weight_list.append(q_part)
+ if i * config.num_key_value_heads % tp_size == 0:
+ k_weight_list.append(k_part)
+ v_weight_list.append(v_part)
+
+ state_dict[q_name] = torch.cat(q_weight_list, dim=0)
+ state_dict[k_name] = torch.cat(k_weight_list, dim=0)
+ state_dict[v_name] = torch.cat(v_weight_list, dim=0)
+
+ # empty cache before collecting weights
+ torch.cuda.empty_cache()
+ # Embeddings
+ # -------------------
+ if dp_rank == 0:
+ # Embeddings
+ # -------------------
+ print_rank_0("collecting embeddings...")
+ gpt_model_module = _get_gpt_model(models[0])
+ _broadcast_tp_shard_tensor(
+ gpt_model_module.model.embed_tokens.weight if pp_rank == 0 else None,
+ "model.embed_tokens.weight",
+ src_pp_rank=0,
+ )
+
+ # Transformer layers
+ # -------------------
+ layer_map = _megatron_calc_layer_map(config)
+ for layer in range(config.num_hidden_layers):
+ print_rank_0(f"collecting layer #{layer}...")
+ layer_name = f"model.layers.{layer}"
+ src_pp_rank, src_virtual_pp_rank, src_layer_idx = layer_map[layer]
+
+ gpt_model_module = _get_gpt_model(models[src_virtual_pp_rank])
+ sync_layer = gpt_model_module.model.layers[src_layer_idx]
+
+ _broadcast_tensor(
+ sync_layer.input_layernorm.weight,
+ f"{layer_name}.input_layernorm.weight",
+ src_pp_rank=src_pp_rank,
+ )
+
+ _broadcast_tp_shard_tensor_qkv(
+ sync_layer.self_attn.qkv_proj.weight,
+ f"{layer_name}.self_attn.q_proj.weight",
+ f"{layer_name}.self_attn.k_proj.weight",
+ f"{layer_name}.self_attn.v_proj.weight",
+ src_pp_rank=src_pp_rank,
+ )
+
+ _broadcast_tp_shard_tensor(
+ sync_layer.self_attn.o_proj.weight,
+ f"{layer_name}.self_attn.o_proj.weight",
+ concat_dim=1,
+ src_pp_rank=src_pp_rank,
+ )
+
+ _broadcast_tensor(
+ sync_layer.post_attention_layernorm.weight,
+ f"{layer_name}.post_attention_layernorm.weight",
+ src_pp_rank=src_pp_rank,
+ )
+
+ _broadcast_tp_shard_tensor_gate_up(sync_layer.mlp.gate_up_proj.weight,
+ f"{layer_name}.mlp.gate_proj.weight",
+ f"{layer_name}.mlp.up_proj.weight",
+ src_pp_rank=src_pp_rank)
+
+ _broadcast_tp_shard_tensor(
+ sync_layer.mlp.down_proj.weight,
+ f"{layer_name}.mlp.down_proj.weight",
+ concat_dim=1,
+ src_pp_rank=src_pp_rank,
+ )
+
+ # Final Layernorm
+ # -------------------
+ print_rank_0("collecting final layernorm...")
+ gpt_model_module = _get_gpt_model(models[-1])
+ _broadcast_tensor(
+ getattr(gpt_model_module.model.norm, "weight", None),
+ "model.norm.weight",
+ src_pp_rank=pp_size - 1,
+ )
+
+ print_rank_0("collecting lm_head...")
+
+ if is_value_model:
+ _broadcast_tensor(getattr(gpt_model_module.lm_head, "weight", None) if pp_rank == pp_size - 1 else None,
+ "reward_head.weight",
+ src_pp_rank=pp_size - 1)
+
+ else:
+ _broadcast_tp_shard_tensor(
+ getattr(gpt_model_module.lm_head, "weight", None) if pp_rank == pp_size - 1 else None,
+ "lm_head.weight",
+ src_pp_rank=pp_size - 1,
+ )
+
+ dist.barrier()
+
+ torch.cuda.empty_cache()
+ if torch.distributed.get_rank() == 0:
+ if dtype == "fp16":
+ dtype = torch.float16
+ elif dtype == "bf16":
+ dtype = torch.bfloat16
+ elif dtype is None or dtype == "fp32":
+ dtype = torch.float32
+ else:
+            print(f"Unknown/unsupported dtype to save: {dtype}")
+ exit(1)
+ for k, v in state_dict.items():
+ if dtype != v.dtype:
+ state_dict[k] = v.to(dtype)
+
+ print_rank_0(f"merge megatron ckpt done, time elapsed {time.time() - start_time}s")
+ return state_dict
diff --git a/code/RL_model/verl/Search-R1/verl/models/llama/megatron/layers/__init__.py b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3761bae7db33c29c66534b9ae4f1d8ec8f63b829
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/layers/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .parallel_attention import ParallelLlamaAttention
+from .parallel_decoder import ParallelLlamaDecoderLayer, ParallelLlamaDecoderLayerRmPad
+from .parallel_mlp import ParallelLlamaMLP
+from .parallel_rmsnorm import ParallelLlamaRMSNorm
diff --git a/code/RL_model/verl/Search-R1/verl/models/llama/megatron/layers/parallel_attention.py b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/layers/parallel_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..f14653fca4ade888f5ee08e32aa57711c1cf5e73
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/layers/parallel_attention.py
@@ -0,0 +1,418 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Optional, Tuple
+
+import torch
+from megatron.core import parallel_state as mpu
+from megatron.core import tensor_parallel
+from megatron.core import ModelParallelConfig
+from torch import nn
+from transformers import LlamaConfig
+from verl.models.llama.megatron.layers.parallel_linear import QKVParallelLinear
+
+from verl.utils.megatron import tensor_parallel as tp_utils
+
+
+class LlamaRotaryEmbedding(nn.Module):
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+ inv_freq = 1.0 / (self.base**(torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(seq_len=max_position_embeddings,
+ device=self.inv_freq.device,
+ dtype=torch.get_default_dtype())
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
+ )
+
+
+class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
+ """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+ t = t / self.scaling_factor
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
+class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
+ """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+
+ if seq_len > self.max_position_embeddings:
+ base = self.base * ((self.scaling_factor * seq_len / self.max_position_embeddings) -
+ (self.scaling_factor - 1))**(self.dim / (self.dim - 2))
+ inv_freq = 1.0 / (base**(torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
+def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., :x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2:]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+ """
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+ """
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ if n_rep == 1:
+ return hidden_states
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+class ParallelLlamaAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
+ super().__init__()
+ self.config = config
+ self.megatron_config = megatron_config
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.hidden_size // self.num_heads
+ self.num_key_value_heads = config.num_key_value_heads
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.rope_theta = config.rope_theta
+
+ # assign values after tp
+ tp_size = mpu.get_tensor_model_parallel_world_size()
+ assert self.num_heads % tp_size == 0, f'num_head must be divisible by tp_size. Got num_head={self.num_heads}, tp_size={tp_size}'
+ assert self.num_key_value_heads % tp_size == 0, \
+ f'num_key_value_heads must be divisible by tp_size. Got num_key_value_heads={self.num_key_value_heads}, tp_size={tp_size}'
+
+ self.num_heads_per_tp = self.num_heads // tp_size
+ self.num_key_value_heads_per_tp = self.num_key_value_heads // tp_size
+ self.hidden_size_per_tp = self.hidden_size // tp_size
+
+ if (self.head_dim * self.num_heads) != self.hidden_size:
+ raise ValueError(f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+ f" and `num_heads`: {self.num_heads}).")
+
+ column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
+ row_kwargs = tp_utils.get_default_kwargs_for_row_parallel_linear()
+
+ if megatron_config is not None:
+ assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
+ assert row_kwargs.get('config', False), 'must have ModelParallelConfig'
+ tp_utils.update_kwargs_with_config(column_kwargs, megatron_config)
+ tp_utils.update_kwargs_with_config(row_kwargs, megatron_config)
+
+ # [self.q_size, self.k_size, self.v_size]
+ self.qkv_proj = QKVParallelLinear(input_size=self.hidden_size,
+ num_heads=self.num_heads,
+ num_key_value_heads=self.num_key_value_heads,
+ head_dim=self.head_dim,
+ bias=config.attention_bias,
+ gather_output=False,
+ skip_bias_add=False,
+ **column_kwargs)
+
+ self.q_size = self.num_heads_per_tp * self.head_dim
+ self.k_size = self.num_key_value_heads_per_tp * self.head_dim
+ self.v_size = self.num_key_value_heads_per_tp * self.head_dim
+
+ self.o_proj = tensor_parallel.RowParallelLinear(input_size=self.num_heads * self.head_dim,
+ output_size=self.hidden_size,
+ bias=config.attention_bias,
+ input_is_parallel=True,
+ skip_bias_add=False,
+ **row_kwargs)
+
+ self._init_rope()
+
+ def _init_rope(self):
+ if self.config.rope_scaling is None:
+ self.rotary_emb = LlamaRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+ else:
+ scaling_type = self.config.rope_scaling["type"]
+ scaling_factor = self.config.rope_scaling["factor"]
+ if scaling_type == "linear":
+ self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "dynamic":
+ self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ else:
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ bsz, q_len, _ = hidden_states.size()
+ qkv = self.qkv_proj(hidden_states)[0]
+ query_states, key_states, value_states = qkv.split([self.q_size, self.k_size, self.v_size], dim=-1)
+
+ query_states = query_states.view(bsz, q_len, self.num_heads_per_tp, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads_per_tp, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads_per_tp, self.head_dim).transpose(1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+ if attn_weights.size() != (bsz, self.num_heads_per_tp, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz, self.num_heads_per_tp, q_len, kv_seq_len)}, but is"
+ f" {attn_weights.size()}")
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}")
+ attn_weights = attn_weights + attention_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads_per_tp, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads_per_tp, q_len, self.head_dim)}, but is"
+ f" {attn_output.size()}")
+
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size_per_tp)
+ attn_output = self.o_proj(attn_output)[0]
+ return attn_output
+
+
+"""
+Remove padding Attention
+- Using Flash-attn 2
+- Compatible with sequence parallel
+"""
+
+from transformers.utils import is_flash_attn_2_available
+import torch.nn.functional as F
+
+from einops import rearrange
+
+if is_flash_attn_2_available():
+ from flash_attn import flash_attn_varlen_func
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
+
+
+def apply_rotary_pos_emb_rmpad(q, k, cos, sin, position_ids, indices, sequence_length):
+ batch_size = position_ids.shape[0]
+
+ q = pad_input(q, indices, batch_size, sequence_length) # (batch_size, seqlen, num_head, head_dim)
+ k = pad_input(k, indices, batch_size, sequence_length)
+ cos = cos[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim]
+ sin = sin[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim]
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+
+ q_embed = index_first_axis(rearrange(q_embed, "b s ... -> (b s) ..."), indices)
+ k_embed = index_first_axis(rearrange(k_embed, "b s ... -> (b s) ..."), indices)
+
+ return q_embed, k_embed
+
+
+from flash_attn.layers.rotary import apply_rotary_emb
+
+
+# use flash-attn rotary embeddings with rmpad
+# cos/sin should be: (seq_length, rotary_dim / 2)
+def apply_rotary_pos_emb_rmpad_flash(q, k, cos, sin, cu_seqlens, max_seqlen):
+ q_embed = apply_rotary_emb(q,
+ cos,
+ sin,
+ interleaved=False,
+ inplace=False,
+ cu_seqlens=cu_seqlens,
+ max_seqlen=max_seqlen)
+ k_embed = apply_rotary_emb(k,
+ cos,
+ sin,
+ interleaved=False,
+ inplace=False,
+ cu_seqlens=cu_seqlens,
+ max_seqlen=max_seqlen)
+ return q_embed, k_embed
+
+
+class ParallelLlamaAttentionRmPad(ParallelLlamaAttention):
+
+ def forward(self,
+ hidden_states: torch.Tensor,
+ position_ids: Optional[torch.LongTensor] = None,
+ sequence_length: int = None,
+ indices: torch.Tensor = None,
+ cu_seqlens: torch.Tensor = None,
+ max_seqlen_in_batch: int = None):
+ total_nnz, _, _ = hidden_states.size() # This is the total_nnz padded after sequence parallel
+
+ if self.megatron_config.sequence_parallel:
+ total_nnz = total_nnz * mpu.get_tensor_model_parallel_world_size()
+
+ qkv = self.qkv_proj(hidden_states)[0]
+ query_states, key_states, value_states = qkv.split([self.q_size, self.k_size, self.v_size],
+ dim=-1) # (total_nnz, 1, hidden_size)
+
+ if self.megatron_config.sequence_parallel:
+ sequence_parallel_pad = total_nnz - cu_seqlens[-1]
+ total_nnz = cu_seqlens[-1] # total_nnz before sp padding
+ query_states = query_states[:total_nnz]
+ key_states = key_states[:total_nnz]
+ value_states = value_states[:total_nnz]
+
+        # Flash attention (varlen) expects inputs of shape
+        # (total_nnz, num_heads, head_dim),
+        # so we just need to reshape into that layout
+ query_states = query_states.view(total_nnz, self.num_heads_per_tp, self.head_dim)
+ key_states = key_states.view(total_nnz, self.num_key_value_heads_per_tp, self.head_dim)
+ value_states = value_states.view(total_nnz, self.num_key_value_heads_per_tp, self.head_dim)
+
+ cos, sin = self.rotary_emb(value_states, seq_len=sequence_length)
+ cos, sin = cos[:, :cos.shape[1] // 2], sin[:, :sin.shape[1] // 2] # flash attn only needs half
+ query_states, key_states = apply_rotary_pos_emb_rmpad_flash(query_states,
+ key_states,
+ cos,
+ sin,
+ cu_seqlens=cu_seqlens,
+ max_seqlen=max_seqlen_in_batch)
+ # query_states, key_states = apply_rotary_pos_emb_rmpad(query_states, key_states, cos, sin, position_ids, indices,
+
+ # TODO: llama does not have dropout in the config??
+ # It is recommended to use dropout with FA according to the docs
+ # when training.
+ dropout_rate = 0.0 # if not self.training else self.attn_dropout
+
+        # In PEFT, we usually cast the layer norms to float32 for training stability,
+        # so the input hidden states get silently cast to float32. Hence, we
+        # cast them back to float16 just to be sure everything works as expected.
+        # This might slow down training & inference, so it is recommended not to cast
+        # the LayerNorms to fp32. (LlamaRMSNorm handles it correctly)
+ input_dtype = query_states.dtype
+ if input_dtype == torch.float32:
+ query_states = query_states.to(torch.float16)
+ key_states = key_states.to(torch.float16)
+ value_states = value_states.to(torch.float16)
+
+ attn_output_unpad = flash_attn_varlen_func(
+ query_states,
+ key_states,
+ value_states,
+ cu_seqlens_q=cu_seqlens,
+ cu_seqlens_k=cu_seqlens,
+ max_seqlen_q=max_seqlen_in_batch,
+ max_seqlen_k=max_seqlen_in_batch,
+ dropout_p=dropout_rate,
+ softmax_scale=None,
+ causal=True,
+ )
+
+ attn_output_unpad = attn_output_unpad.to(input_dtype)
+ attn_output_unpad = attn_output_unpad.reshape(total_nnz, 1, self.hidden_size_per_tp).contiguous()
+
+ # sequence parallel reduce_scatter is performed inside RowColumnParallel if enabled
+ # Here we need to repad
+ if self.megatron_config.sequence_parallel:
+ attn_output_unpad = F.pad(attn_output_unpad, pad=(0, 0, 0, 0, 0, sequence_parallel_pad))
+
+ attn_output_unpad = self.o_proj(attn_output_unpad)[0]
+ return attn_output_unpad
diff --git a/code/RL_model/verl/Search-R1/verl/models/llama/megatron/layers/parallel_decoder.py b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/layers/parallel_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..93050a37fefb35d8377a1593f0ea3a4e23938a27
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/layers/parallel_decoder.py
@@ -0,0 +1,146 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+from megatron.core import ModelParallelConfig
+
+from .parallel_attention import ParallelLlamaAttention, ParallelLlamaAttentionRmPad
+from .parallel_mlp import ParallelLlamaMLP
+from .parallel_rmsnorm import ParallelLlamaRMSNorm
+
+
+class ParallelLlamaDecoderLayer(nn.Module):
+
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+ self.self_attn = ParallelLlamaAttention(config=config, megatron_config=megatron_config)
+
+ self.mlp = ParallelLlamaMLP(config, megatron_config=megatron_config)
+ self.input_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
+ self.post_attention_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            position_ids (`torch.LongTensor`, *optional*): indices of the positions of each
+                input token in the sequence, used for rotary position embeddings.
+ """
+
+ residual = hidden_states
+
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Note: sequence parallel is hidden inside ColumnParallelLinear
+ # reduce scatter is hidden inside RowParallelLinear
+
+ # Self Attention
+ hidden_states = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ )
+
+ # TODO: add sequence parallel operator reduce_scatter here
+
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+
+ # TODO: add sequence parallel operator all_gather here
+
+ hidden_states = self.mlp(hidden_states)
+
+ # TODO: add sequence parallel operator reduce_scatter here
+
+ hidden_states = residual + hidden_states
+
+ outputs = hidden_states
+
+ return outputs
+
+
+class ParallelLlamaDecoderLayerRmPad(nn.Module):
+
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
+ super().__init__()
+ self.config = config
+ self.megatron_config = megatron_config
+ self.hidden_size = config.hidden_size
+ self.self_attn = ParallelLlamaAttentionRmPad(config=config, megatron_config=megatron_config)
+
+ self.mlp = ParallelLlamaMLP(config, megatron_config=megatron_config)
+ self.input_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
+ self.post_attention_layernorm = ParallelLlamaRMSNorm(config, megatron_config)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_ids: Optional[torch.LongTensor] = None,
+ sequence_length: int = None,
+ indices: torch.Tensor = None,
+        cu_seqlens: torch.Tensor = None,
+ max_seqlen_in_batch: int = None
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ residual = hidden_states # (total_nnz // sp, 1, hidden_size)
+
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Self Attention
+ # (total_nnz // sp, 1, hidden_size) -> all-gather (total_nnz, 1, hidden_size)
+ # -> col + row -> reduce-scatter -> (total_nnz // sp, 1, hidden_size)
+ hidden_states = self.self_attn(hidden_states=hidden_states,
+ position_ids=position_ids,
+ sequence_length=sequence_length,
+ indices=indices,
+ cu_seqlens=cu_seqlens,
+ max_seqlen_in_batch=max_seqlen_in_batch)
+
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ # shape changes same as attn
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = hidden_states
+
+ return outputs
diff --git a/code/RL_model/verl/Search-R1/verl/models/llama/megatron/layers/parallel_linear.py b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/layers/parallel_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfe5cf4e65e4bdd02ebc64ed8f85943b2f6f3a5f
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/layers/parallel_linear.py
@@ -0,0 +1,74 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/linear.py
+
+from typing import Optional, Tuple
+
+from megatron.core import tensor_parallel
+
+
+class QKVParallelLinear(tensor_parallel.ColumnParallelLinear):
+
+ def __init__(self,
+ input_size,
+ num_heads,
+ num_key_value_heads,
+ head_dim,
+ *,
+ bias=True,
+ gather_output=True,
+ skip_bias_add=False,
+ **kwargs):
+        # Store the sizing parameters; the fused output size below is derived from the head counts
+ self.input_size = input_size
+ self.q_output_size = num_heads * head_dim
+ self.kv_output_size = num_key_value_heads * head_dim
+ self.head_dim = head_dim
+ self.gather_output = gather_output
+ self.skip_bias_add = skip_bias_add
+
+ input_size = self.input_size
+ output_size = (num_heads + 2 * num_key_value_heads) * self.head_dim
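+        # Worked example (illustrative numbers): with num_heads=32, num_key_value_heads=8
+        # and head_dim=128 (a typical GQA setup), output_size = (32 + 2 * 8) * 128 = 6144,
+        # which ColumnParallelLinear then shards across tp ranks.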
+
+ super().__init__(input_size=input_size,
+ output_size=output_size,
+ bias=bias,
+ gather_output=gather_output,
+ skip_bias_add=skip_bias_add,
+ **kwargs)
+
+
+class MergedColumnParallelLinear(tensor_parallel.ColumnParallelLinear):
+
+ def __init__(self,
+ input_size,
+                 gate_output_size,
+ up_output_size,
+ *,
+ bias=True,
+ gather_output=True,
+ skip_bias_add=False,
+ **kwargs):
+        # Store the sizing parameters; the fused output concatenates the gate and up projections
+        self.input_size = input_size
+        self.output_size = gate_output_size + up_output_size
+ self.gather_output = gather_output
+ self.skip_bias_add = skip_bias_add
+
+ super().__init__(input_size=self.input_size,
+ output_size=self.output_size,
+ bias=bias,
+ gather_output=gather_output,
+ skip_bias_add=skip_bias_add,
+ **kwargs)
diff --git a/code/RL_model/verl/Search-R1/verl/models/llama/megatron/layers/parallel_mlp.py b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/layers/parallel_mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..21ad9b16a642655dd593ce4d1e5fafb31d81c435
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/layers/parallel_mlp.py
@@ -0,0 +1,74 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from megatron.core import parallel_state as mpu
+from megatron.core import tensor_parallel
+from megatron.core import ModelParallelConfig
+from torch import nn
+from transformers.activations import ACT2FN
+from verl.models.llama.megatron.layers.parallel_linear import MergedColumnParallelLinear
+
+from verl.utils.megatron import tensor_parallel as tp_utils
+
+
+class ParallelLlamaMLP(nn.Module):
+
+ def __init__(self, config, megatron_config: ModelParallelConfig = None) -> None:
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.intermediate_size = config.intermediate_size
+ # The weight is only [hidden_size, intermediate_size // model_parallel_world_size]
+
+ column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
+ row_kwargs = tp_utils.get_default_kwargs_for_row_parallel_linear()
+
+ if megatron_config is not None:
+ assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
+ assert row_kwargs.get('config', False), 'must have ModelParallelConfig'
+ tp_utils.update_kwargs_with_config(row_kwargs, megatron_config)
+ tp_utils.update_kwargs_with_config(column_kwargs, megatron_config)
+
+ tp_size = mpu.get_tensor_model_parallel_world_size()
+
+ self.gate_up_proj = MergedColumnParallelLinear(
+ input_size=self.hidden_size,
+            gate_output_size=self.intermediate_size,
+ up_output_size=self.intermediate_size,
+ bias=False,
+ gather_output=False,
+ skip_bias_add=False,
+ **column_kwargs,
+ )
+ self.gate_size = self.intermediate_size // tp_size
+
+ self.down_proj = tensor_parallel.RowParallelLinear(input_size=self.intermediate_size,
+ output_size=self.hidden_size,
+ bias=False,
+ input_is_parallel=True,
+ skip_bias_add=False,
+ **row_kwargs)
+
+ self.act_fn = ACT2FN[config.hidden_act]
+
+ def forward(self, x):
+ gate_up = self.gate_up_proj(x)[0]
+ gate, up = gate_up.split(self.gate_size, dim=-1)
+ return self.down_proj(self.act_fn(gate) * up)[0]
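+
+
+# Shape sketch for ParallelLlamaMLP.forward (illustrative, tensor parallel size = tp):
+#   x:        (..., hidden_size)
+#   gate_up:  (..., 2 * intermediate_size // tp)  fused column-parallel projection
+#   gate, up: (..., intermediate_size // tp)      split along the last dim
+#   out:      (..., hidden_size)                  down_proj all-reduces (or reduce-scatters
+#                                                 under sequence parallel) across tp ranks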
diff --git a/code/RL_model/verl/Search-R1/verl/models/llama/megatron/layers/parallel_rmsnorm.py b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/layers/parallel_rmsnorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..7027036bf48d47a7f983226e9308336f85ad0461
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/layers/parallel_rmsnorm.py
@@ -0,0 +1,46 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numbers
+import torch
+from megatron.core import ModelParallelConfig
+from torch import nn
+from transformers import LlamaConfig
+
+from apex.normalization.fused_layer_norm import fused_rms_norm_affine
+from verl.utils.megatron import sequence_parallel as sp_utils
+
+
+class ParallelLlamaRMSNorm(nn.Module):
+
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
+ """
+ LlamaRMSNorm is equivalent to T5LayerNorm
+ """
+ super().__init__()
+ if isinstance(config.hidden_size, numbers.Integral):
+ normalized_shape = (config.hidden_size,)
+ self.normalized_shape = torch.Size(normalized_shape)
+ self.weight = nn.Parameter(torch.ones(self.normalized_shape))
+ self.variance_epsilon = config.rms_norm_eps
+
+ if megatron_config.sequence_parallel:
+ sp_utils.mark_parameter_as_sequence_parallel(self.weight)
+
+ def forward(self, hidden_states):
+ return fused_rms_norm_affine(input=hidden_states,
+ weight=self.weight,
+ normalized_shape=self.normalized_shape,
+ eps=self.variance_epsilon,
+ memory_efficient=True)
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/models/llama/megatron/modeling_llama_megatron.py b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/modeling_llama_megatron.py
new file mode 100644
index 0000000000000000000000000000000000000000..c693f33c5872e341368aad4ee4b0f2b99ed5f5cd
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/llama/megatron/modeling_llama_megatron.py
@@ -0,0 +1,656 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch LLaMA model with Megatron-style acceleration."""
+
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from megatron.core import tensor_parallel
+from megatron.core import ModelParallelConfig
+from torch import nn
+from transformers.modeling_outputs import BaseModelOutputWithPast
+from transformers.models.llama.configuration_llama import LlamaConfig
+from transformers.models.llama.modeling_llama import CausalLMOutputWithPast
+
+from verl.utils.megatron import sequence_parallel as sp_utils
+from verl.utils.megatron import tensor_parallel as tp_utils
+from .layers import ParallelLlamaDecoderLayer, ParallelLlamaRMSNorm, ParallelLlamaDecoderLayerRmPad
+"""
+TODO:
+1. Add weight initialization. Here we need to be careful with TP weight init.
+2. Add sequence parallel.
+3. Load checkpoints from Meta's pretrained LLaMA checkpoints.
+"""
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device):
+ """
+ Make causal mask used for bi-directional self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len)
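+
+# Illustrative example: for tgt_len=3 the mask (before broadcasting) looks like
+#   [[0,   -inf, -inf],
+#    [0,   0,    -inf],
+#    [0,   0,    0   ]]
+# where -inf stands for torch.finfo(dtype).min.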
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+ """
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+ """
+ bsz, src_len = mask.size()
+ tgt_len = tgt_len if tgt_len is not None else src_len
+
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+ inverted_mask = 1.0 - expanded_mask
+
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+class ParallelLlamaModel(nn.Module):
+ """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ParallelLlamaDecoderLayer`]
+
+ Args:
+ config: LlamaConfig
+ """
+
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
+ super().__init__()
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+ embedding_kwargs = tp_utils.get_default_kwargs_for_parallel_embedding()
+ if megatron_config is not None:
+ assert embedding_kwargs.get('config', False), 'must have ModelParallelConfig'
+            tp_utils.update_kwargs_with_config(embedding_kwargs, megatron_config)
+ self.embed_tokens = tensor_parallel.VocabParallelEmbedding(num_embeddings=config.vocab_size,
+ embedding_dim=config.hidden_size,
+ **embedding_kwargs)
+
+ self.layers = nn.ModuleList(
+ [ParallelLlamaDecoderLayer(config, megatron_config) for _ in range(config.num_hidden_layers)])
+ self.norm = ParallelLlamaRMSNorm(config, megatron_config)
+
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds):
+ # create causal mask
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ combined_attention_mask = None
+ if input_shape[-1] > 1:
+ combined_attention_mask = _make_causal_mask(
+ input_shape,
+ inputs_embeds.dtype,
+ device=inputs_embeds.device,
+ )
+
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype,
+ tgt_len=input_shape[-1]).to(inputs_embeds.device)
+ combined_attention_mask = (expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask +
+ combined_attention_mask)
+
+ return combined_attention_mask
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
+ """
+
+ Args:
+ input_ids: input ids. shape (batch_size, seq_length)
+ attention_mask: attention_mask. shape (batch_size, seq_length)
+ position_ids: position ids. shape (batch_size, seq_length)
+
+ Returns:
+
+ """
+ batch_size, seq_length = input_ids.shape
+ inputs_embeds = self.embed_tokens(input_ids)
+ # embed positions
+
+ attention_mask = self._prepare_decoder_attention_mask(attention_mask, (batch_size, seq_length), inputs_embeds)
+
+ hidden_states = inputs_embeds
+
+ for idx, decoder_layer in enumerate(self.layers):
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ )
+
+ hidden_states = layer_outputs
+
+ hidden_states = self.norm(hidden_states)
+
+ return hidden_states
+
+
+class ParallelLlamaForCausalLM(nn.Module):
+
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
+ super().__init__()
+ self.model = ParallelLlamaModel(config, megatron_config=megatron_config)
+ self.vocab_size = config.vocab_size
+
+ column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
+ if megatron_config is not None:
+ assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
+            tp_utils.update_kwargs_with_config(column_kwargs, megatron_config)
+
+ self.lm_head = tensor_parallel.ColumnParallelLinear(input_size=config.hidden_size,
+ output_size=config.vocab_size,
+ bias=False,
+ gather_output=False,
+ skip_bias_add=False,
+ **column_kwargs)
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+ r"""
+ Args:
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ Returns:
+ ```"""
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ )
+
+ hidden_states = outputs
+ logits = self.lm_head(hidden_states)[0]
+
+ logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits)
+
+ logits = logits.float()
+ return CausalLMOutputWithPast(
+ loss=None,
+ logits=logits,
+ past_key_values=None,
+ hidden_states=None,
+ attentions=None,
+ )
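+
+# Minimal usage sketch (hypothetical; assumes Megatron model-parallel state is already initialized):
+#
+#   model = ParallelLlamaForCausalLM(config, megatron_config)
+#   out = model(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids)
+#   logits = out.logits  # (batch_size, seq_length, vocab_size), gathered across tp ranks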
+
+
+from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
+
+
+class ParallelLlamaModelRmPad(nn.Module):
+ """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ParallelLlamaDecoderLayerRmPad`]
+
+ Args:
+ config: LlamaConfig
+ """
+
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
+ super().__init__()
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+ embedding_kwargs = tp_utils.get_default_kwargs_for_parallel_embedding()
+ self.megatron_config = megatron_config
+ if megatron_config is not None:
+ assert embedding_kwargs.get('config', False), 'must have ModelParallelConfig'
+ tp_utils.update_kwargs_with_config(embedding_kwargs, self.megatron_config)
+ self.embed_tokens = tensor_parallel.VocabParallelEmbedding(num_embeddings=config.vocab_size,
+ embedding_dim=config.hidden_size,
+ **embedding_kwargs)
+
+ self.layers = nn.ModuleList(
+ [ParallelLlamaDecoderLayerRmPad(config, megatron_config) for _ in range(config.num_hidden_layers)])
+ self.norm = ParallelLlamaRMSNorm(config, megatron_config)
+
+ def forward(self,
+ input_ids: torch.Tensor,
+ position_ids: Optional[torch.LongTensor] = None,
+                sequence_length: Optional[int] = None,
+                indices: Optional[torch.Tensor] = None,
+                cu_seqlens: Optional[torch.Tensor] = None,
+                max_seqlen_in_batch: Optional[int] = None) -> Union[Tuple, BaseModelOutputWithPast]:
+ """
+
+ Args:
+            input_ids: input ids. shape (1, total_nnz)
+ position_ids: position ids. shape (batch_size, seq_length)
+
+ Returns:
+
+ """
+ inputs_embeds = self.embed_tokens(input_ids) # (1, total_nnz) -> (1, total_nnz, hidden_size)
+
+ # (1, total_nnz, hidden_size) -> (total_nnz, 1, hidden_size) -> (total_nnz // sp, 1, hidden_size)
+ inputs_embeds = inputs_embeds.transpose(0, 1)
+ if self.megatron_config.sequence_parallel:
+ inputs_embeds = tensor_parallel.scatter_to_sequence_parallel_region(inputs_embeds)
+
+ hidden_states = inputs_embeds
+ for idx, decoder_layer in enumerate(self.layers):
+ layer_outputs = decoder_layer(hidden_states,
+ position_ids=position_ids,
+ sequence_length=sequence_length,
+ indices=indices,
+ cu_seqlens=cu_seqlens,
+ max_seqlen_in_batch=max_seqlen_in_batch)
+
+ hidden_states = layer_outputs
+
+ hidden_states = self.norm(hidden_states)
+
+ return hidden_states
+
+
+class ParallelLlamaForCausalLMRmPad(nn.Module):
+
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig):
+ super().__init__()
+ self.config = config
+ self.megatron_config = megatron_config
+ self.model = ParallelLlamaModelRmPad(config, megatron_config=megatron_config)
+ self.vocab_size = config.vocab_size
+ self._init_head()
+
+ def _init_head(self):
+ column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
+ if self.megatron_config is not None:
+ assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
+ tp_utils.update_kwargs_with_config(column_kwargs, self.megatron_config)
+ self.lm_head = tensor_parallel.ColumnParallelLinear(input_size=self.config.hidden_size,
+ output_size=self.config.vocab_size,
+ bias=False,
+ gather_output=False,
+ skip_bias_add=False,
+ **column_kwargs)
+
+ def _forward_head(self, hidden_states):
+ # all_gather from sequence parallel region is performed inside lm_head
+ logits = self.lm_head(hidden_states)[0]
+ logits = logits.float() # (total_nnz_padded, 1, vocab_size // tp)
+ logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits) # (total_nnz_padded, 1, vocab_size)
+ return logits
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+ r"""
+ Args:
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ Returns:
+ ```"""
+ batch_size, sequence_length = input_ids.shape
+
+ # remove padding here
+ input_ids, indices, cu_seqlens, max_seqlen_in_batch, *_ = unpad_input(input_ids.unsqueeze(dim=-1),
+ attention_mask) # (total_nnz, 1)
+
+ # pad input_ids to multiple of tp for all tp ranks
+        # TODO: for better performance, the sp padding should be removed at each layer; not sure about the performance gap
+ if self.megatron_config.sequence_parallel:
+ input_ids = sp_utils.pad_to_sequence_parallel(input_ids)
+
+ input_ids = input_ids.transpose(0, 1) # (1, total_nnz+pad)
+
+ outputs = self.model(input_ids=input_ids,
+ position_ids=position_ids,
+ sequence_length=sequence_length,
+ indices=indices,
+ cu_seqlens=cu_seqlens,
+ max_seqlen_in_batch=max_seqlen_in_batch)
+
+ hidden_states = outputs
+
+ logits = self._forward_head(hidden_states)
+
+ # remove padding from sequence parallel
+ if self.megatron_config.sequence_parallel:
+            total_nnz = cu_seqlens[-1]
+            logits = logits[:total_nnz]  # (total_nnz, 1, vocab_size)
+
+ logits = torch.squeeze(logits, dim=1) # remove the artificial batch dimension
+ # add removed padding back
+ logits = pad_input(logits, indices, batch_size,
+ seqlen=sequence_length) # (batch_size, sequence_length, vocab_size)
+
+ return CausalLMOutputWithPast(
+ loss=None,
+ logits=logits,
+ past_key_values=None,
+ hidden_states=None,
+ attentions=None,
+ )
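+
+    # Worked example of the remove-padding round trip (illustrative): with batch_size=2,
+    # sequence_length=4 and attention_mask row sums [3, 2], unpad_input flattens the batch
+    # to total_nnz=5 tokens; the model runs on the packed (1, 5(+sp pad)) layout, and
+    # pad_input scatters the logits back to (2, 4, vocab_size).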
+
+
+class ParallelLlamaForValueRmPad(ParallelLlamaForCausalLMRmPad):
+
+ def _init_head(self):
+ column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
+ if self.megatron_config is not None:
+ assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
+ tp_utils.update_kwargs_with_config(column_kwargs, self.megatron_config)
+ self.lm_head = nn.Linear(in_features=self.config.hidden_size, out_features=1, bias=False)
+        # the value head is replicated across tp ranks; mark it as sequence parallel so
+        # its gradient is all-reduced across the tp group when sequence parallel is enabled
+        sp_utils.mark_parameter_as_sequence_parallel(self.lm_head.weight)
+
+ def _forward_head(self, hidden_states):
+ logits = self.lm_head(hidden_states) # (total_nnz_padded // tp, 1, 1)
+ logits = logits.float()
+ if self.megatron_config.sequence_parallel:
+ logits = tensor_parallel.gather_from_sequence_parallel_region(logits, tensor_parallel_output_grad=False)
+ return logits
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+ output = super().forward(input_ids, attention_mask, position_ids)
+ output.logits = torch.squeeze(output.logits, dim=-1)
+ return output
+
+
+"""
+Support pipeline parallelism
+"""
+
+
+class ParallelLlamaModelRmPadPP(nn.Module):
+ """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ParallelLlamaDecoderLayerRmPad`].
+    This model definition supports pipeline parallelism. To support pp and vpp:
+    - This model only contains the layers belonging to this pp stage and vpp chunk.
+    - When calling get_model in Megatron, this rank instantiates all the vpp chunks of its pp stage.
+ Args:
+ config: LlamaConfig
+ """
+
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig, pre_process, post_process):
+ super().__init__()
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+ self.pre_process = pre_process
+ self.post_process = post_process
+ self.megatron_config = megatron_config
+ embedding_kwargs = tp_utils.get_default_kwargs_for_parallel_embedding()
+ if megatron_config is not None:
+ assert embedding_kwargs.get('config', False), 'must have ModelParallelConfig'
+ tp_utils.update_kwargs_with_config(embedding_kwargs, self.megatron_config)
+ if pre_process:
+ self.embed_tokens = tensor_parallel.VocabParallelEmbedding(num_embeddings=config.vocab_size,
+ embedding_dim=config.hidden_size,
+ **embedding_kwargs)
+ else:
+ self.embed_tokens = None
+
+ # pp_rank = megatron_config.pipeline_model_parallel_rank
+ pp_size = megatron_config.pipeline_model_parallel_size
+ self.num_layer_per_pp = config.num_hidden_layers // pp_size
+ vpp_size = megatron_config.virtual_pipeline_model_parallel_size
+
+ if vpp_size is not None:
+ self.num_layer_vpp_chunk = self.num_layer_per_pp // vpp_size
+ self.num_layer_this_model = self.num_layer_vpp_chunk
+ # vpp_rank = megatron_config.virtual_pipeline_model_parallel_rank
+ # self.offset = vpp_rank * (
+ # config.num_hidden_layers // megatron_config.virtual_pipeline_model_parallel_size) + \
+ # (megatron_config.pipeline_model_parallel_rank * self.num_layer_vpp_chunk)
+ else:
+ self.num_layer_this_model = self.num_layer_per_pp
+ # self.offset = pp_rank * self.num_layer_per_pp
+
+ layers = []
+ for i in range(self.num_layer_this_model):
+ layer = ParallelLlamaDecoderLayerRmPad(config, megatron_config)
+ # setattr(layer, 'hidden_layer_index', self.offset + i)
+ layers.append(layer)
+
+ self.layers = nn.ModuleList(layers)
+
+ if post_process:
+ self.norm = ParallelLlamaRMSNorm(config, megatron_config)
+ else:
+ self.norm = None
+
+ def set_input_tensor(self, input_tensor):
+ """Set input tensor to be used instead of forward()'s input.
+
+ When doing pipeline parallelism the input from the previous
+ stage comes from communication, not from the input, so the
+ model's forward_step_func won't have it. This function is thus
+ used by internal code to bypass the input provided by the
+ forward_step_func"""
+ self.input_tensor = input_tensor
+
+ def forward(self,
+ input_ids: torch.Tensor,
+ position_ids: Optional[torch.LongTensor] = None,
+                sequence_length: Optional[int] = None,
+                indices: Optional[torch.Tensor] = None,
+                cu_seqlens: Optional[torch.Tensor] = None,
+                max_seqlen_in_batch: Optional[int] = None) -> Union[Tuple, BaseModelOutputWithPast]:
+ """
+
+ Args:
+            input_ids: input ids. shape (1, total_nnz)
+ position_ids: position ids. shape (batch_size, seq_length)
+
+ Returns:
+
+ """
+ if self.pre_process:
+ inputs_embeds = self.embed_tokens(input_ids) # (1, total_nnz) -> (1, total_nnz, hidden_size)
+
+ # vocab parallel embedding will not do sequence parallel reduce-scatter in open source megatron
+            # so we handle the scatter manually here:
+ # (1, total_nnz, hidden_size) -> (total_nnz, 1, hidden_size) -> (total_nnz // sp, 1, hidden_size)
+ inputs_embeds = inputs_embeds.transpose(0, 1)
+ if self.megatron_config.sequence_parallel:
+ inputs_embeds = tensor_parallel.scatter_to_sequence_parallel_region(inputs_embeds)
+
+ hidden_states = inputs_embeds
+ else:
+ # self.hidden_states should be passed by Megatron
+ hidden_states = self.input_tensor
+
+ for idx, decoder_layer in enumerate(self.layers):
+ layer_outputs = decoder_layer(hidden_states,
+ position_ids=position_ids,
+ sequence_length=sequence_length,
+ indices=indices,
+ cu_seqlens=cu_seqlens,
+ max_seqlen_in_batch=max_seqlen_in_batch)
+
+ hidden_states = layer_outputs
+
+ if self.post_process:
+ hidden_states = self.norm(hidden_states)
+
+ return hidden_states
+
+
+class ParallelLlamaForCausalLMRmPadPP(nn.Module):
+
+ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig, pre_process, post_process):
+ super().__init__()
+ self.config = config
+ self.megatron_config = megatron_config
+ self.model = ParallelLlamaModelRmPadPP(config,
+ megatron_config=megatron_config,
+ pre_process=pre_process,
+ post_process=post_process)
+ self.share_embeddings_and_output_weights = None # workaround, megatron requires this attr
+ self.vocab_size = config.vocab_size
+ self.pre_process = pre_process
+ self.post_process = post_process
+ if post_process:
+ self._init_head()
+
+ def set_input_tensor(self, input_tensor):
+ """Set input tensor to be used instead of forward()'s input.
+
+ When doing pipeline parallelism the input from the previous
+ stage comes from communication, not from the input, so the
+ model's forward_step_func won't have it. This function is thus
+ used by internal code to bypass the input provided by the
+ forward_step_func"""
+ assert len(input_tensor) == 1
+ self.model.set_input_tensor(input_tensor[0])
+
+ def _init_head(self):
+ column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
+ if self.megatron_config is not None:
+ assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
+ tp_utils.update_kwargs_with_config(column_kwargs, self.megatron_config)
+ self.lm_head = tensor_parallel.ColumnParallelLinear(input_size=self.config.hidden_size,
+ output_size=self.config.vocab_size,
+ bias=False,
+ gather_output=False,
+ skip_bias_add=False,
+ **column_kwargs)
+
+ def _forward_head(self, hidden_states):
+ # all_gather from sequence parallel region is performed inside lm_head
+        # hidden_states: (total_nnz_padded // sp, 1, hidden_size)
+        logits = self.lm_head(hidden_states)[0]
+        # logits: (total_nnz_padded, 1, vocab_size // tp)
+        logits = logits.float()
+ return logits
+
+ def forward(
+ self,
+ # original input
+ *,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+ r"""
+ Args:
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ Returns:
+ ```"""
+
+        # Note that input_ids, attention_mask and position_ids should be passed to every pp stage.
+        # In the first pp stage, input_ids are used; in later stages, self.model consumes the hidden_states set via set_input_tensor.
+ batch_size, sequence_length = input_ids.shape
+ # remove padding here
+ input_ids_rmpad, indices, cu_seqlens, max_seqlen_in_batch, *_ = unpad_input(input_ids.unsqueeze(dim=-1),
+ attention_mask) # (total_nnz, 1)
+
+ # pad input_ids to multiple of tp for all tp ranks
+        # TODO: for better performance, the sp padding should be removed at each layer; not sure about the performance gap
+ if self.megatron_config.sequence_parallel:
+ input_ids_rmpad = sp_utils.pad_to_sequence_parallel(input_ids_rmpad)
+
+ input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz+pad)
+
+ outputs = self.model(input_ids=input_ids_rmpad,
+ position_ids=position_ids,
+ sequence_length=sequence_length,
+ indices=indices,
+ cu_seqlens=cu_seqlens,
+ max_seqlen_in_batch=max_seqlen_in_batch)
+
+ if self.post_process:
+ hidden_states = outputs
+            logits = self._forward_head(hidden_states)
+            logits = torch.squeeze(logits, dim=1)  # remove the artificial batch dimension
+
+ # remove padding from sequence parallel
+ if self.megatron_config.sequence_parallel:
+                total_nnz = cu_seqlens[-1]
+                logits = logits[:total_nnz]  # (total_nnz, vocab_size)
+            # add removed padding back; if the input is already rmpad, the caller is expected to call pad_input
+ logits = pad_input(logits, indices, batch_size,
+ seqlen=sequence_length) # (batch_size, sequence_length, vocab_size)
+
+ return CausalLMOutputWithPast(
+ loss=None,
+ logits=logits,
+ past_key_values=None,
+ hidden_states=None,
+ attentions=None,
+ )
+ else:
+ return outputs
+
+
+class ParallelLlamaForValueRmPadPP(ParallelLlamaForCausalLMRmPadPP):
+
+ def _init_head(self):
+ column_kwargs = tp_utils.get_default_kwargs_for_column_parallel_linear()
+ if self.megatron_config is not None:
+ assert column_kwargs.get('config', False), 'must have ModelParallelConfig'
+ tp_utils.update_kwargs_with_config(column_kwargs, self.megatron_config)
+ self.lm_head = nn.Linear(in_features=self.config.hidden_size, out_features=1, bias=False)
+        # the value head is replicated across tp ranks; mark it as sequence parallel so
+        # its gradient is all-reduced across the tp group when sequence parallel is enabled
+        sp_utils.mark_parameter_as_sequence_parallel(self.lm_head.weight)
+
+ def _forward_head(self, hidden_states):
+ logits = self.lm_head(hidden_states) # (total_nnz_padded // tp, 1, 1)
+ logits = logits.float()
+ if self.megatron_config.sequence_parallel:
+ logits = tensor_parallel.gather_from_sequence_parallel_region(logits, tensor_parallel_output_grad=False)
+ return logits
+
+ def forward(
+ self,
+ *,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+ output = super().forward(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids)
+ if self.post_process:
+ output.logits = torch.squeeze(output.logits, dim=-1)
+ return output
+ else:
+ return output
diff --git a/code/RL_model/verl/Search-R1/verl/models/registry.py b/code/RL_model/verl/Search-R1/verl/models/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..55ddbd4493d3287511fcaca1c215a22d8930b1a1
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/registry.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+from typing import List, Optional, Type
+
+import torch.nn as nn
+
+# Supported models using HF Rmpad
+# TODO(sgm): HF may support more models than listed here; we should add more after testing
+from transformers import LlamaConfig, MistralConfig, GemmaConfig, Qwen2Config
+
+_REMOVEPAD_MODELS = {'llama': LlamaConfig, 'mistral': MistralConfig, 'gemma': GemmaConfig, 'qwen2': Qwen2Config}
+
+
+def check_model_support_rmpad(model_type: str):
+ assert isinstance(model_type, str)
+    if model_type not in _REMOVEPAD_MODELS:
+        raise ValueError(f"Model architecture {model_type} is not supported for now. "
+                         f"RMPad supported architectures: {_REMOVEPAD_MODELS.keys()}. "
+                         f"Please set `use_remove_padding=False` in the model config.")
+
+
+# Supported models in Megatron-LM
+# Architecture -> (module, class).
+_MODELS = {
+ "LlamaForCausalLM":
+ ("llama", ("ParallelLlamaForCausalLMRmPadPP", "ParallelLlamaForValueRmPadPP", "ParallelLlamaForCausalLMRmPad")),
+ "MistralForCausalLM": ("mistral", ("ParallelMistralForCausalLMRmPadPP", "ParallelMistralForValueRmPadPP",
+ "ParallelMistralForCausalLMRmPad"))
+}
+
+
+# return model class
+class ModelRegistry:
+
+ @staticmethod
+ def load_model_cls(model_arch: str, value=False) -> Optional[Type[nn.Module]]:
+ if model_arch not in _MODELS:
+ return None
+
+ megatron = "megatron"
+
+ module_name, model_cls_name = _MODELS[model_arch]
+ if not value: # actor/ref
+ model_cls_name = model_cls_name[0]
+ elif value: # critic/rm
+ model_cls_name = model_cls_name[1]
+
+ module = importlib.import_module(f"verl.models.{module_name}.{megatron}.modeling_{module_name}_megatron")
+ return getattr(module, model_cls_name, None)
+
+ @staticmethod
+ def get_supported_archs() -> List[str]:
+ return list(_MODELS.keys())
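+
+
+# Minimal usage sketch (hypothetical):
+#   model_cls = ModelRegistry.load_model_cls("LlamaForCausalLM", value=False)  # actor/ref class
+#   # -> ParallelLlamaForCausalLMRmPadPP; value=True returns ParallelLlamaForValueRmPadPP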
diff --git a/code/RL_model/verl/Search-R1/verl/models/transformers/__init__.py b/code/RL_model/verl/Search-R1/verl/models/transformers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/transformers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/models/transformers/llama.py b/code/RL_model/verl/Search-R1/verl/models/transformers/llama.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e8a5b1906474435c235320d119dd1a7f9c61fa5
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/transformers/llama.py
@@ -0,0 +1,145 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from typing import Optional, Tuple
+
+from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
+from transformers.cache_utils import Cache
+from transformers.utils import logging
+from transformers.modeling_flash_attention_utils import _flash_attention_forward
+from verl.utils.ulysses import gather_heads_scatter_seq, gather_seq_scatter_heads, get_ulysses_sequence_parallel_world_size
+
+logger = logging.get_logger(__name__)
+
+def llama_flash_attn_forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
+ **kwargs,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """
+    Adapted from transformers 4.47.1.
+ """
+ output_attentions = False
+
+ bsz, q_len, _ = hidden_states.size()
+
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+    # Flash attention expects inputs of shape (batch_size, seq_length, num_heads, head_dim);
+    # here we use (bsz, num_heads, seq_len, head_dim) for RoPE and the Ulysses all-to-all,
+    # and transpose back right before calling flash attention.
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ # trade off: repeat first and then all to all
+ # key_states = repeat_kv(key_states, self.num_key_value_groups)
+ # value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ ########## AlltoAll for Ulysses ##########
+ ulysses_sp_size = get_ulysses_sequence_parallel_world_size()
+
+ if ulysses_sp_size > 1:
+ # (bsz, n_head, seq_len/n, head_dim) -> (bsz, n_head/n, seq_len, head_dim)
+ query_states = gather_seq_scatter_heads(query_states, seq_dim=2, head_dim=1)
+ key_states = gather_seq_scatter_heads(key_states, seq_dim=2, head_dim=1)
+ value_states = gather_seq_scatter_heads(value_states, seq_dim=2, head_dim=1)
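+        # Illustrative numbers: with ulysses_sp_size=2, 32 query heads and a full
+        # sequence of 1024 tokens, each rank goes from (bsz, 32, 512, head_dim)
+        # to (bsz, 16, 1024, head_dim) after the all-to-all.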
+
+ full_q_len = query_states.size(2) # full seq length
+
+ if position_embeddings is None:
+ logger.warning_once(
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+ "removed and `position_embeddings` will be mandatory.")
+ cos, sin = self.rotary_emb(value_states, position_ids)
+ else:
+ cos, sin = position_embeddings
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+ if past_key_value is not None:
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+ # to be able to avoid many of these transpose/reshape/view.
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ dropout_rate = self.attention_dropout if self.training else 0.0
+
+    # In PEFT, we usually cast the layer norms to float32 for training stability,
+    # so the input hidden states get silently cast to float32. Hence, we need to
+    # cast them back to the correct dtype just to be sure everything works as expected.
+    # This might slow down training & inference, so it is recommended not to cast the
+    # LayerNorms to fp32. (LlamaRMSNorm handles it correctly.)
+
+ input_dtype = query_states.dtype
+ if input_dtype == torch.float32:
+ if torch.is_autocast_enabled():
+ target_dtype = torch.get_autocast_gpu_dtype()
+ # Handle the case where the model is quantized
+ elif hasattr(self.config, "_pre_quantization_dtype"):
+ target_dtype = self.config._pre_quantization_dtype
+ else:
+ target_dtype = self.q_proj.weight.dtype
+
+ logger.warning_once(
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+ f" {target_dtype}.")
+
+ query_states = query_states.to(target_dtype)
+ key_states = key_states.to(target_dtype)
+ value_states = value_states.to(target_dtype)
+
+ attn_output = _flash_attention_forward(
+ query_states,
+ key_states,
+ value_states,
+ attention_mask,
+ full_q_len,
+ position_ids=position_ids,
+ dropout=dropout_rate,
+ sliding_window=getattr(self, "sliding_window", None),
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
+ is_causal=self.is_causal,
+ **kwargs,
+ )
+
+ attn_output = attn_output.reshape(bsz, full_q_len, -1, self.head_dim).contiguous()
+ ########## AlltoAll for Ulysses ##########
+ if ulysses_sp_size > 1:
+ attn_output = gather_heads_scatter_seq(attn_output, seq_dim=1, head_dim=2)
+ attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+ attn_output = self.o_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
diff --git a/code/RL_model/verl/Search-R1/verl/models/transformers/monkey_patch.py b/code/RL_model/verl/Search-R1/verl/models/transformers/monkey_patch.py
new file mode 100644
index 0000000000000000000000000000000000000000..a11148b4d0ed565d5a9a5b43babe47789a9ce726
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/transformers/monkey_patch.py
@@ -0,0 +1,74 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Apply monkey-patch function to models
+"""
+
+#### Open Source Models
+#### transformers version < 4.48
+
+
+def apply_monkey_patch_to_llama():
+ from transformers.models.llama.modeling_llama import LlamaFlashAttention2
+ from verl.models.transformers.llama import llama_flash_attn_forward
+ LlamaFlashAttention2.forward = llama_flash_attn_forward
+
+
+def apply_monkey_patch_to_qwen2():
+ from transformers.models.qwen2.modeling_qwen2 import Qwen2FlashAttention2
+ from verl.models.transformers.qwen2 import qwen2_flash_attn_forward
+ Qwen2FlashAttention2.forward = qwen2_flash_attn_forward
+
+
+_PATCH_NAME_TO_FUNC = {
+ 'llama': apply_monkey_patch_to_llama,
+ 'qwen2': apply_monkey_patch_to_qwen2,
+}
+
+from transformers import PretrainedConfig
+
+
+def apply_monkey_patch(config: PretrainedConfig, verbose=True):
+ if not is_transformers_version_in_range("4.45.0", "4.47.1"):
+ raise AssertionError("The installed `transformers` version doesn't support ulysses patch. "
+ "Please install a version between 4.45.0 and 4.47.1 to use this ulysses feature.")
+ success_apply_monkey_patch = False
+ if config.model_type in _PATCH_NAME_TO_FUNC:
+ _PATCH_NAME_TO_FUNC[config.model_type]()
+ success_apply_monkey_patch = True
+
+ if success_apply_monkey_patch and verbose:
+ print(f'Applying monkey patch to model {config.model_type}')
+ elif not success_apply_monkey_patch:
+        raise NotImplementedError(f'Ulysses for model {config.model_type} is not implemented, '
+                                  'please set `ulysses_sequence_parallel_size=1`')
+
+ return success_apply_monkey_patch
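+
+
+# Minimal usage sketch (hypothetical model path, shown for illustration only):
+#   from transformers import AutoConfig
+#   config = AutoConfig.from_pretrained("meta-llama/Llama-2-7b-hf")
+#   apply_monkey_patch(config)  # replaces LlamaFlashAttention2.forward with the Ulysses-aware version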
+
+
+from functools import lru_cache
+from packaging import version
+import importlib.metadata
+
+
+@lru_cache()
+def is_transformers_version_in_range(min_version: str, max_version: str) -> bool:
+ try:
+ # Get the installed version of the transformers library
+ transformers_version = importlib.metadata.version("transformers")
+ except importlib.metadata.PackageNotFoundError:
+ raise ModuleNotFoundError("The `transformers` package is not installed.")
+
+ # Check if the version is within the specified range
+ return version.parse(min_version) <= version.parse(transformers_version) <= version.parse(max_version)
diff --git a/code/RL_model/verl/Search-R1/verl/models/transformers/qwen2.py b/code/RL_model/verl/Search-R1/verl/models/transformers/qwen2.py
new file mode 100644
index 0000000000000000000000000000000000000000..b267b8436b9e70cd9ea32f046dfdab71a4ce7565
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/transformers/qwen2.py
@@ -0,0 +1,137 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from typing import Optional, Tuple
+
+from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
+from transformers.cache_utils import Cache
+from transformers.utils import logging
+from transformers.modeling_flash_attention_utils import _flash_attention_forward
+from verl.utils.ulysses import gather_heads_scatter_seq, gather_seq_scatter_heads, get_ulysses_sequence_parallel_world_size
+
+logger = logging.get_logger(__name__)
+
+
+def qwen2_flash_attn_forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
+):
+ bsz, q_len, _ = hidden_states.size()
+
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+ query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+
+ ########## AlltoAll for Ulysses ##########
+ ulysses_sp_size = get_ulysses_sequence_parallel_world_size()
+
+ if ulysses_sp_size > 1:
+ # (bsz, n_head, seq_len/n, head_dim) -> (bsz, n_head/n, seq_len, head_dim)
+ query_states = gather_seq_scatter_heads(query_states, seq_dim=2, head_dim=1)
+ key_states = gather_seq_scatter_heads(key_states, seq_dim=2, head_dim=1)
+ value_states = gather_seq_scatter_heads(value_states, seq_dim=2, head_dim=1)
+
+ full_q_len = query_states.size(2) # full seq length
+
+ if position_embeddings is None:
+ logger.warning_once(
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+ "removed and `position_embeddings` will be mandatory.")
+ cos, sin = self.rotary_emb(value_states, position_ids)
+ else:
+ cos, sin = position_embeddings
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+ if past_key_value is not None:
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ # repeat k/v heads if n_kv_heads < n_heads
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+ dropout_rate = 0.0 if not self.training else self.attention_dropout
+
+    # In PEFT, we usually cast the layer norms to float32 for training stability,
+    # so the input hidden states get silently cast to float32. Hence, we need to
+    # cast them back to the correct dtype just to be sure everything works as expected.
+ input_dtype = query_states.dtype
+ if input_dtype == torch.float32:
+ if torch.is_autocast_enabled():
+ target_dtype = torch.get_autocast_gpu_dtype()
+ # Handle the case where the model is quantized
+ elif hasattr(self.config, "_pre_quantization_dtype"):
+ target_dtype = self.config._pre_quantization_dtype
+ else:
+ target_dtype = self.q_proj.weight.dtype
+
+ logger.warning_once(
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+ f" {target_dtype}.")
+
+ query_states = query_states.to(target_dtype)
+ key_states = key_states.to(target_dtype)
+ value_states = value_states.to(target_dtype)
+
+    # Reshape to the expected shape for Flash Attention
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ if (self.config.use_sliding_window and getattr(self.config, "sliding_window", None) is not None and
+ self.layer_idx >= self.config.max_window_layers):
+ sliding_window = self.config.sliding_window
+ else:
+ sliding_window = None
+
+ attn_output = _flash_attention_forward(
+ query_states,
+ key_states,
+ value_states,
+ attention_mask,
+ full_q_len,
+ position_ids=position_ids,
+ dropout=dropout_rate,
+ sliding_window=sliding_window,
+ is_causal=self.is_causal,
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
+ )
+
+ # use full_q_len to reshape
+ attn_output = attn_output.reshape(bsz, full_q_len, -1, self.head_dim).contiguous()
+ ########## AlltoAll for Ulysses ##########
+ if ulysses_sp_size > 1:
+ attn_output = gather_heads_scatter_seq(attn_output, seq_dim=1, head_dim=2)
+ attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+ attn_output = self.o_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
diff --git a/code/RL_model/verl/Search-R1/verl/models/weight_loader_registry.py b/code/RL_model/verl/Search-R1/verl/models/weight_loader_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..17f0c5cae957d6bd665fd0f9dcdc84c1206adfa8
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/models/weight_loader_registry.py
@@ -0,0 +1,23 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def get_weight_loader(arch: str):
+ from verl.models.llama.megatron.checkpoint_utils.llama_loader import load_state_dict_to_megatron_llama
+ _MODEL_WEIGHT_MEGATRON_LOADER_REGISTRY = {'LlamaForCausalLM': load_state_dict_to_megatron_llama}
+
+ if arch in _MODEL_WEIGHT_MEGATRON_LOADER_REGISTRY:
+ return _MODEL_WEIGHT_MEGATRON_LOADER_REGISTRY[arch]
+ raise ValueError(f"Model architectures {arch} are not supported for now. "
+ f"Supported architectures: {_MODEL_WEIGHT_MEGATRON_LOADER_REGISTRY.keys()}")
diff --git a/code/RL_model/verl/Search-R1/verl/protocol.py b/code/RL_model/verl/Search-R1/verl/protocol.py
new file mode 100644
index 0000000000000000000000000000000000000000..803da36643a70a69f08541d74e2782ad72db32a9
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/protocol.py
@@ -0,0 +1,639 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Implement base data transfer protocol between any two functions, modules.
+We can subclass Protocol to define more detailed batch info with specific keys
+"""
+
+import pickle
+import numpy as np
+import copy
+from dataclasses import dataclass, field
+from typing import Callable, Dict, List, Union
+
+import torch
+import tensordict
+from tensordict import TensorDict
+from torch.utils.data import DataLoader, Dataset
+
+from verl.utils.py_functional import union_two_dict
+
+__all__ = ['DataProto', 'union_tensor_dict']
+
+try:
+ tensordict.set_lazy_legacy(False).set()
+except Exception:
+ pass
+
+
+def pad_dataproto_to_divisor(data: 'DataProto', size_divisor: int):
+ """Pad a DataProto to size divisible by size_divisor
+
+ Args:
+ size_divisor (int): size divisor
+
+ Returns:
+ data: (DataProto): the padded DataProto
+ pad_size (int)
+ """
+ assert isinstance(data, DataProto), 'data must be a DataProto'
+ if len(data) % size_divisor != 0:
+ pad_size = size_divisor - len(data) % size_divisor
+ data_padded = DataProto.concat([data, data[:pad_size]])
+ else:
+ pad_size = 0
+ data_padded = data
+ return data_padded, pad_size
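+
+
+# Worked example: len(data) = 10 with size_divisor = 4 gives pad_size = 2; the first two
+# items are repeated so the padded DataProto has length 12, and unpad_dataproto(data, 2)
+# removes them again.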
+
+
+def unpad_dataproto(data: 'DataProto', pad_size):
+ if pad_size != 0:
+ data = data[:-pad_size]
+ return data
+
+
+def union_tensor_dict(tensor_dict1: TensorDict, tensor_dict2: TensorDict) -> TensorDict:
+ """Union two tensordicts."""
+    assert tensor_dict1.batch_size == tensor_dict2.batch_size, \
+        f'Two tensordicts must have identical batch sizes. Got {tensor_dict1.batch_size} and {tensor_dict2.batch_size}'
+ for key in tensor_dict2.keys():
+ if key not in tensor_dict1.keys():
+ tensor_dict1[key] = tensor_dict2[key]
+ else:
+            assert tensor_dict1[key].equal(tensor_dict2[key]), \
+                f'{key} in tensor_dict1 and tensor_dict2 are not equal'
+
+ return tensor_dict1
+
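+# Hedged usage sketch: keys from tensor_dict2 are merged into tensor_dict1;
+# overlapping keys must hold equal tensors, otherwise the assert above fires.
+#   >>> import torch
+#   >>> from tensordict import TensorDict
+#   >>> td1 = TensorDict({'a': torch.ones(4)}, batch_size=[4])
+#   >>> td2 = TensorDict({'a': torch.ones(4), 'b': torch.zeros(4)}, batch_size=[4])
+#   >>> sorted(union_tensor_dict(td1, td2).keys())
+#   ['a', 'b']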
+
+def union_numpy_dict(tensor_dict1: dict[str, np.ndarray], tensor_dict2: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
+    for key, val in tensor_dict2.items():
+        if key in tensor_dict1:
+            assert isinstance(tensor_dict2[key], np.ndarray)
+            assert isinstance(tensor_dict1[key], np.ndarray)
+            # conflicting keys must hold equal arrays, mirroring union_tensor_dict
+            assert np.all(tensor_dict2[key] == tensor_dict1[key]), \
+                f'{key} in tensor_dict1 and tensor_dict2 are not equal'
+        tensor_dict1[key] = val
+
+ return tensor_dict1
+
+
+def list_of_dict_to_dict_of_list(list_of_dict: list[dict]):
+ if len(list_of_dict) == 0:
+ return {}
+ keys = list_of_dict[0].keys()
+ output = {key: [] for key in keys}
+ for data in list_of_dict:
+ for key, item in data.items():
+ assert key in output
+ output[key].append(item)
+ return output
+
+
+def fold_batch_dim(data: 'DataProto', new_batch_size):
+ """
+ Fold a batch dim from [bsz, xxx] into [new_bsz, bsz // new_bsz, xxx]
+ """
+ batch_size = data.batch.batch_size[0]
+
+ assert batch_size % new_batch_size == 0
+
+ tensor: TensorDict = data.batch
+ non_tensor = data.non_tensor_batch
+
+ tensor = tensor.view(new_batch_size, -1)
+ tensor.auto_batch_size_(batch_dims=1)
+
+ for key, val in non_tensor.items():
+ non_tensor[key] = np.reshape(val, newshape=(new_batch_size, -1, *val.shape[1:]))
+
+ return DataProto(batch=tensor, non_tensor_batch=non_tensor, meta_info=data.meta_info)
+
+
+def unfold_batch_dim(data: 'DataProto', batch_dims=2):
+ """
+ Unfold the first n dims as new batch dim
+ """
+ tensor: TensorDict = data.batch
+ non_tensor = data.non_tensor_batch
+ tensor.auto_batch_size_(batch_dims=batch_dims)
+ tensor = tensor.view(-1)
+
+ batch_size = tensor.batch_size[0]
+
+ non_tensor_new = {}
+
+ for key, val in non_tensor.items():
+ non_tensor_new[key] = np.reshape(val, newshape=(batch_size, *val.shape[batch_dims:]))
+
+ return DataProto(batch=tensor, non_tensor_batch=non_tensor_new, meta_info=data.meta_info)
+
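+# Hedged round-trip sketch: fold an [8, ...] batch into [2, 4, ...] and back.
+#   >>> import torch
+#   >>> d = DataProto.from_dict(tensors={'x': torch.arange(8)})
+#   >>> folded = fold_batch_dim(d, new_batch_size=2)
+#   >>> folded.batch['x'].shape      # batch dim folded; batch_size becomes (2,)
+#   torch.Size([2, 4])
+#   >>> unfold_batch_dim(folded, batch_dims=2).batch['x'].shape
+#   torch.Size([8])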
+
+def collate_fn(x: list['DataProtoItem']):
+ batch = []
+ non_tensor_batch = []
+ for data in x:
+ batch.append(data.batch)
+ non_tensor_batch.append(data.non_tensor_batch)
+ batch = torch.stack(batch).contiguous()
+ non_tensor_batch = list_of_dict_to_dict_of_list(non_tensor_batch)
+ for key, val in non_tensor_batch.items():
+ non_tensor_batch[key] = np.array(val, dtype=object)
+ return DataProto(batch=batch, non_tensor_batch=non_tensor_batch)
+
+
+@dataclass
+class DataProtoItem:
+ # TODO(zhangchi.usc1992) add consistency check
+ batch: TensorDict = None
+ non_tensor_batch: Dict = field(default_factory=dict)
+ meta_info: Dict = field(default_factory=dict)
+
+
+@dataclass
+class DataProto:
+ """
+ A DataProto is a data structure that aims to provide a standard protocol for data exchange between functions.
+ It contains a batch (TensorDict) and a meta_info (Dict). The batch is a TensorDict https://pytorch.org/tensordict/.
+ TensorDict allows you to manipulate a dictionary of Tensors like a single Tensor. Ideally, the tensors with the
+ same batch size should be put inside batch.
+ """
+ batch: TensorDict = None
+ non_tensor_batch: Dict = field(default_factory=dict)
+ meta_info: Dict = field(default_factory=dict)
+
+ def __post_init__(self):
+ # perform necessary checking
+ self.check_consistency()
+
+ def __len__(self):
+ if self.batch is not None:
+ return self.batch.batch_size[0]
+ elif self.non_tensor_batch is not None and len(self.non_tensor_batch) > 0:
+ random_key = list(self.non_tensor_batch.keys())[0]
+ return self.non_tensor_batch[random_key].shape[0]
+ else:
+ return 0
+
+ def __getitem__(self, item):
+ tensor_data = self.batch[item]
+ non_tensor_data = {key: val[item] for key, val in self.non_tensor_batch.items()}
+ return DataProtoItem(batch=tensor_data, non_tensor_batch=non_tensor_data, meta_info=self.meta_info)
+
+ def __getstate__(self):
+ import io
+ buffer = io.BytesIO()
+        # consolidate() exists from tensordict 0.5.0; note this is a naive string version comparison
+        if tensordict.__version__ >= '0.5.0' and self.batch is not None:
+ self.batch = self.batch.contiguous()
+ self.batch = self.batch.consolidate()
+ torch.save(self.batch, buffer)
+ buffer_bytes = buffer.getvalue()
+ return buffer_bytes, self.non_tensor_batch, self.meta_info
+
+ def __setstate__(self, data):
+ import io
+ batch_deserialized_bytes, non_tensor_batch, meta_info = data
+ batch_deserialized = io.BytesIO(initial_bytes=batch_deserialized_bytes)
+ batch = torch.load(batch_deserialized,
+ weights_only=False,
+ map_location='cpu' if not torch.cuda.is_available() else None)
+ self.batch = batch
+ self.non_tensor_batch = non_tensor_batch
+ self.meta_info = meta_info
+
+ def save_to_disk(self, filepath):
+ with open(filepath, 'wb') as f:
+ pickle.dump(self, f)
+
+ @staticmethod
+ def load_from_disk(filepath) -> 'DataProto':
+ with open(filepath, 'rb') as f:
+ data = pickle.load(f)
+ return data
+
+ def print_size(self, prefix=""):
+ size_of_tensordict = 0
+ for key, tensor in self.batch.items():
+ size_of_tensordict += tensor.element_size() * tensor.numel()
+ size_of_numpy_array = 0
+ for key, numpy_array in self.non_tensor_batch.items():
+ size_of_numpy_array += numpy_array.nbytes
+
+ size_of_numpy_array /= 1024**3
+ size_of_tensordict /= 1024**3
+
+ message = f'Size of tensordict: {size_of_tensordict} GB, size of non_tensor_batch: {size_of_numpy_array} GB'
+
+ if prefix:
+ message = f'{prefix}, ' + message
+ print(message)
+
+ def check_consistency(self):
+        """Check the consistency of the DataProto, mainly between batch and non_tensor_batch.
+        We expose this function publicly so that users can call it directly.
+        """
+ if self.batch is not None:
+ assert len(self.batch.batch_size) == 1, 'only support num_batch_dims=1'
+
+ if self.non_tensor_batch is not None:
+ for key, val in self.non_tensor_batch.items():
+ assert isinstance(val, np.ndarray)
+
+ if self.batch is not None and len(self.non_tensor_batch) != 0:
+ # TODO: we can actually lift this restriction if needed
+ assert len(self.batch.batch_size) == 1, 'only support num_batch_dims=1 when non_tensor_batch is not empty.'
+
+ batch_size = self.batch.batch_size[0]
+ for key, val in self.non_tensor_batch.items():
+ assert isinstance(
+ val, np.ndarray
+ ) and val.dtype == object, 'data in the non_tensor_batch must be a numpy.array with dtype=object'
+ assert val.shape[
+ 0] == batch_size, f'key {key} length {len(val)} is not equal to batch size {batch_size}'
+
+ @classmethod
+ def from_single_dict(cls, data: Dict[str, Union[torch.Tensor, np.ndarray]], meta_info=None):
+ tensors = {}
+ non_tensors = {}
+
+ for key, val in data.items():
+ if isinstance(val, torch.Tensor):
+ tensors[key] = val
+ elif isinstance(val, np.ndarray):
+ non_tensors[key] = val
+ else:
+ raise ValueError(f'Unsupported type in data {type(val)}')
+
+ return DataProto.from_dict(tensors=tensors, non_tensors=non_tensors, meta_info=meta_info)
+
+ @classmethod
+ def from_dict(cls, tensors: Dict[str, torch.Tensor], non_tensors=None, meta_info=None, num_batch_dims=1):
+        """Create a DataProto from a dict of tensors. This assumes that
+        1. All the tensors in `tensors` share the same dim0
+        2. Only dim0 is the batch dim
+        """
+ assert len(tensors) > 0, 'tensors must not be empty'
+ assert num_batch_dims > 0, 'num_batch_dims must be greater than zero'
+ if non_tensors is not None:
+ assert num_batch_dims == 1, 'only support num_batch_dims=1 when non_tensors is not None.'
+
+ if meta_info is None:
+ meta_info = {}
+ if non_tensors is None:
+ non_tensors = {}
+
+ assert isinstance(non_tensors, dict)
+
+ # get and check batch size
+ batch_size = None
+ pivot_key = None
+ for key, tensor in tensors.items():
+ if batch_size is None:
+ batch_size = tensor.shape[:num_batch_dims]
+ pivot_key = key
+ else:
+ current_batch = tensor.shape[:num_batch_dims]
+            assert batch_size == current_batch, \
+                f'Not all tensors in `tensors` share the same batch size with batch_dims={num_batch_dims}. Got {pivot_key} has {batch_size}, {key} has {current_batch}'
+
+ for key, val in non_tensors.items():
+ non_tensors[key] = np.array(val, dtype=object)
+
+ tensor_dict = TensorDict(source=tensors, batch_size=batch_size)
+ return cls(batch=tensor_dict, non_tensor_batch=non_tensors, meta_info=meta_info)
+
+ def to(self, device) -> 'DataProto':
+ """move the batch to device
+
+ Args:
+ device (torch.device, str): torch device
+
+ Returns:
+ DataProto: the current DataProto
+
+ """
+ if self.batch is not None:
+ self.batch = self.batch.to(device)
+ return self
+
+ def select(self, batch_keys=None, non_tensor_batch_keys=None, meta_info_keys=None, deepcopy=False) -> 'DataProto':
+        """Select a subset of the DataProto via batch_keys and meta_info_keys
+
+        Args:
+            batch_keys (list, optional): a list of strings indicating the keys in batch to select
+            non_tensor_batch_keys (list, optional): a list of keys indicating the non-tensor batch entries to select
+            meta_info_keys (list, optional): a list of keys indicating the meta info to select
+            deepcopy (bool): whether to deep-copy the selected non-tensor and meta-info entries
+
+        Returns:
+            DataProto: the DataProto with the selected batch_keys and meta_info_keys
+        """
+ # TODO (zhangchi.usc1992) whether to copy
+ if batch_keys is not None:
+ batch_keys = tuple(batch_keys)
+ sub_batch = self.batch.select(*batch_keys)
+ else:
+ sub_batch = self.batch
+
+ if non_tensor_batch_keys is not None:
+ non_tensor_batch = {key: val for key, val in self.non_tensor_batch.items() if key in non_tensor_batch_keys}
+ else:
+ non_tensor_batch = self.non_tensor_batch
+
+ if deepcopy:
+ non_tensor_batch = copy.deepcopy(non_tensor_batch)
+
+ if meta_info_keys is not None:
+ sub_meta_info = {key: val for key, val in self.meta_info.items() if key in meta_info_keys}
+ else:
+ sub_meta_info = self.meta_info
+
+ if deepcopy:
+ sub_meta_info = copy.deepcopy(sub_meta_info)
+
+ return DataProto(batch=sub_batch, non_tensor_batch=non_tensor_batch, meta_info=sub_meta_info)
+
+ def pop(self, batch_keys=None, non_tensor_batch_keys=None, meta_info_keys=None) -> 'DataProto':
+        """Pop a subset of the DataProto via `batch_keys` and `meta_info_keys`
+
+        Args:
+            batch_keys (list, optional): a list of strings indicating the keys in batch to pop
+            non_tensor_batch_keys (list, optional): a list of keys indicating the non-tensor batch entries to pop
+            meta_info_keys (list, optional): a list of keys indicating the meta info to pop
+
+        Returns:
+            DataProto: the DataProto with the popped batch_keys and meta_info_keys
+        """
+ assert batch_keys is not None
+ if meta_info_keys is None:
+ meta_info_keys = []
+ if non_tensor_batch_keys is None:
+ non_tensor_batch_keys = []
+
+ tensors = {}
+ # tensor batch
+ for key in batch_keys:
+ assert key in self.batch.keys()
+ tensors[key] = self.batch.pop(key)
+ non_tensors = {}
+ # non tensor batch
+ for key in non_tensor_batch_keys:
+ assert key in self.non_tensor_batch.keys()
+ non_tensors[key] = self.non_tensor_batch.pop(key)
+ meta_info = {}
+ for key in meta_info_keys:
+ assert key in self.meta_info.keys()
+ meta_info[key] = self.meta_info.pop(key)
+ return DataProto.from_dict(tensors=tensors, non_tensors=non_tensors, meta_info=meta_info)
+
+ def rename(self, old_keys=None, new_keys=None) -> 'DataProto':
+        """
+        Note that this function only renames keys in the batch
+        """
+
+ def validate_input(keys):
+ if keys is not None:
+ if isinstance(keys, str):
+ keys = [keys]
+ elif isinstance(keys, list):
+ pass
+ else:
+ raise TypeError(f'keys must be a list or a string, but got {type(keys)}')
+ return keys
+
+ old_keys = validate_input(old_keys)
+ new_keys = validate_input(new_keys)
+
+ if len(new_keys) != len(old_keys):
+ raise ValueError(
+ f'new_keys and old_keys must have the same length, but got {len(new_keys)} and {len(old_keys)}')
+
+ self.batch.rename_key_(tuple(old_keys), tuple(new_keys))
+
+ return self
+
+ def union(self, other: 'DataProto') -> 'DataProto':
+        """Union with another DataProto. Union batch and meta_info separately.
+        Throw an error if
+        - there are conflicting keys in batch and they are not equal
+        - the batch sizes of the two DataProtos are not the same
+        - there are conflicting keys in meta_info and they are not the same.
+
+ Args:
+ other (DataProto): another DataProto to union
+
+ Returns:
+ DataProto: the DataProto after union
+ """
+ self.batch = union_tensor_dict(self.batch, other.batch)
+ self.non_tensor_batch = union_numpy_dict(self.non_tensor_batch, other.non_tensor_batch)
+ self.meta_info = union_two_dict(self.meta_info, other.meta_info)
+ return self
+
+ def make_iterator(self, mini_batch_size, epochs, seed=None, dataloader_kwargs=None):
+ """Make an iterator from the DataProto. This is built upon that TensorDict can be used as a normal Pytorch
+ dataset. See https://pytorch.org/tensordict/tutorials/data_fashion for more details.
+
+ Args:
+ mini_batch_size (int): mini-batch size when iterating the dataset. We require that
+ ``batch.batch_size[0] % mini_batch_size == 0``
+ epochs (int): number of epochs when iterating the dataset.
+ dataloader_kwargs: internally, it returns a DataLoader over the batch.
+ The dataloader_kwargs is the kwargs passed to the DataLoader
+
+ Returns:
+            Iterator: an iterator that yields a mini-batch of data at a time. The total number of iteration steps is
+            ``self.batch.batch_size[0] * epochs // mini_batch_size``
+ """
+ assert self.batch.batch_size[0] % mini_batch_size == 0, f"{self.batch.batch_size[0]} % {mini_batch_size} != 0"
+ # we can directly create a dataloader from TensorDict
+ if dataloader_kwargs is None:
+ dataloader_kwargs = {}
+
+ if seed is not None:
+ generator = torch.Generator()
+ generator.manual_seed(seed)
+ else:
+ generator = None
+
+ assert isinstance(dataloader_kwargs, Dict)
+ train_dataloader = DataLoader(dataset=self,
+ batch_size=mini_batch_size,
+ collate_fn=collate_fn,
+ generator=generator,
+ **dataloader_kwargs)
+
+ def get_data():
+ for _ in range(epochs):
+ for d in train_dataloader:
+ d.meta_info = self.meta_info
+ yield d
+
+ return iter(get_data())
+
+ def chunk(self, chunks: int) -> List['DataProto']:
+        """Split the batch along dim=0 into chunks. The meta_info is passed to each DataProto after the split.
+
+ Args:
+ chunks (int): the number of chunks to split on dim=0
+
+ Returns:
+ List[DataProto]: a list of DataProto after splitting
+ """
+ assert len(
+ self) % chunks == 0, f'only support equal chunk. Got size of DataProto {len(self)} and chunk {chunks}.'
+
+ if self.batch is not None:
+ batch_lst = self.batch.chunk(chunks=chunks, dim=0)
+ else:
+ batch_lst = [None for _ in range(chunks)]
+
+ non_tensor_batch_lst = [{} for _ in range(chunks)]
+ for key, val in self.non_tensor_batch.items():
+ assert isinstance(val, np.ndarray)
+ non_tensor_lst = np.array_split(val, chunks)
+ assert len(non_tensor_lst) == chunks
+ for i in range(chunks):
+ non_tensor_batch_lst[i][key] = non_tensor_lst[i]
+
+ output = []
+ for i in range(chunks):
+ output.append(
+ DataProto(batch=batch_lst[i], non_tensor_batch=non_tensor_batch_lst[i], meta_info=self.meta_info))
+
+ return output
+
+ @staticmethod
+ def concat(data: List['DataProto']) -> 'DataProto':
+        """Concat a list of DataProto. The batch is concatenated along dim=0.
+        The meta_info is assumed to be identical and the first one is used.
+
+ Args:
+ data (List[DataProto]): list of DataProto
+
+ Returns:
+ DataProto: concatenated DataProto
+ """
+ batch_lst = []
+ for batch in data:
+ batch_lst.append(batch.batch)
+ if batch_lst[0] is not None:
+ new_batch = torch.cat(batch_lst, dim=0)
+ else:
+ new_batch = None
+
+ non_tensor_batch = list_of_dict_to_dict_of_list(list_of_dict=[d.non_tensor_batch for d in data])
+ for key, val in non_tensor_batch.items():
+ non_tensor_batch[key] = np.concatenate(val, axis=0)
+
+ return DataProto(batch=new_batch, non_tensor_batch=non_tensor_batch, meta_info=data[0].meta_info)
+
+ def reorder(self, indices):
+ """
+ Note that this operation is in-place
+ """
+ indices_np = indices.detach().numpy()
+ self.batch = self.batch[indices]
+ self.non_tensor_batch = {key: val[indices_np] for key, val in self.non_tensor_batch.items()}
+
+ def repeat(self, repeat_times=2, interleave=True):
+ """
+ Repeat the batch data a specified number of times.
+
+ Args:
+ repeat_times (int): Number of times to repeat the data.
+ interleave (bool): Whether to interleave the repeated data.
+
+ Returns:
+ DataProto: A new DataProto with repeated data.
+ """
+ if self.batch is not None:
+ if interleave:
+ # Interleave the data
+ repeated_tensors = {
+ key: tensor.repeat_interleave(repeat_times, dim=0) for key, tensor in self.batch.items()
+ }
+ else:
+ # Stack the data
+ repeated_tensors = {
+ key: tensor.unsqueeze(0).expand(repeat_times, *tensor.shape).reshape(-1, *tensor.shape[1:])
+ for key, tensor in self.batch.items()
+ }
+
+ repeated_batch = TensorDict(
+ source=repeated_tensors,
+ batch_size=(self.batch.batch_size[0] * repeat_times,),
+ )
+ else:
+ repeated_batch = None
+
+ repeated_non_tensor_batch = {}
+ for key, val in self.non_tensor_batch.items():
+ if interleave:
+ repeated_non_tensor_batch[key] = np.repeat(val, repeat_times, axis=0)
+ else:
+ repeated_non_tensor_batch[key] = np.tile(val, (repeat_times,) + (1,) * (val.ndim - 1))
+
+ return DataProto(
+ batch=repeated_batch,
+ non_tensor_batch=repeated_non_tensor_batch,
+ meta_info=self.meta_info,
+ )
+
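+# Hedged sketch of DataProto.repeat semantics on a tiny batch:
+#   >>> import torch
+#   >>> d = DataProto.from_dict(tensors={'x': torch.tensor([1, 2])})
+#   >>> d.repeat(2, interleave=True).batch['x'].tolist()     # A A B B
+#   [1, 1, 2, 2]
+#   >>> d.repeat(2, interleave=False).batch['x'].tolist()    # A B A B
+#   [1, 2, 1, 2]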
+
+import ray
+
+
+@dataclass
+class DataProtoFuture:
+ """
+    DataProtoFuture aims to eliminate actual data fetching on the driver. By doing so, the driver doesn't have to wait
+    for data, so asynchronous execution becomes possible.
+    DataProtoFuture contains a list of futures from another WorkerGroup of size world_size.
+    - collect_fn is a Callable that reduces the list of futures to a DataProto
+    - dispatch_fn is a Callable that partitions the DataProto into a list of DataProto of size world_size and then selects one
+
+    Potential issue: we could optimize dispatch_fn(collect_fn) so that only the needed data is fetched on the destination
+    - DataProtoFuture only supports directly passing the output of one method to the input of another. You can't perform any
+    operation on a DataProtoFuture on the driver.
+ """
+ collect_fn: Callable
+ futures: List[ray.ObjectRef]
+ dispatch_fn: Callable = None
+
+ @staticmethod
+ def concat(data: List[ray.ObjectRef]) -> 'DataProtoFuture':
+ output = DataProtoFuture(collect_fn=DataProto.concat, futures=data)
+ return output
+
+ def chunk(self, chunks: int) -> List['DataProtoFuture']:
+ from functools import partial
+
+ arg_future_lst = []
+ for i in range(chunks):
+ # note that we can't directly pass i and chunks
+ def dispatch_fn(x, i, chunks):
+ return x.chunk(chunks=chunks)[i]
+
+ arg_future = DataProtoFuture(collect_fn=self.collect_fn,
+ dispatch_fn=partial(dispatch_fn, i=i, chunks=chunks),
+ futures=self.futures)
+ arg_future_lst.append(arg_future)
+ return arg_future_lst
+
+ def get(self):
+ output = ray.get(self.futures) # dp_size.
+ for o in output:
+ assert isinstance(o, DataProto)
+ output = self.collect_fn(output) # select dp, concat
+ if self.dispatch_fn is not None:
+ output = self.dispatch_fn(output) # split in batch dim, select using dp
+ return output
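+
+
+# Hedged driver-side sketch (names are illustrative, not part of this file):
+# the output futures of one WorkerGroup call can be chained into the next call
+# without a ray.get on the driver.
+#   future = DataProtoFuture.concat(object_refs)   # object_refs: List[ray.ObjectRef]
+#   per_rank = future.chunk(chunks=world_size)     # lazy per-rank views
+#   result = per_rank[0].get()                     # fetch + collect + dispatch on demand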
diff --git a/code/RL_model/verl/Search-R1/verl/single_controller/__init__.py b/code/RL_model/verl/Search-R1/verl/single_controller/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd850b790c7ef7ea88515b58e629cad45c0c84e2
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/single_controller/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+version_folder = os.path.dirname(os.path.abspath(__file__))
+
+with open(os.path.join(version_folder, 'version/version')) as f:
+ __version__ = f.read().strip()
diff --git a/code/RL_model/verl/Search-R1/verl/single_controller/base/__init__.py b/code/RL_model/verl/Search-R1/verl/single_controller/base/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..75846436cd1285259d2bae6d4a7f190aebed1a80
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/single_controller/base/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .worker import Worker
+from .worker_group import WorkerGroup, ClassWithInitArgs, ResourcePool
diff --git a/code/RL_model/verl/Search-R1/verl/single_controller/base/decorator.py b/code/RL_model/verl/Search-R1/verl/single_controller/base/decorator.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fdacb6d97bc5897be837863236f6f057a024739
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/single_controller/base/decorator.py
@@ -0,0 +1,410 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum
+from functools import wraps
+from typing import Dict, List, Tuple
+from types import FunctionType
+from verl.protocol import DataProtoFuture
+
+# here we add a magic number to avoid colliding with an attribute a user-defined function may already have
+MAGIC_ATTR = 'attrs_3141562937'
+
+
+class Dispatch(Enum):
+ RANK_ZERO = 0
+ ONE_TO_ALL = 1
+ ALL_TO_ALL = 2
+ MEGATRON_COMPUTE = 3
+ MEGATRON_PP_AS_DP = 4
+ MEGATRON_PP_ONLY = 5
+ MEGATRON_COMPUTE_PROTO = 6
+ MEGATRON_PP_AS_DP_PROTO = 7
+ DP_COMPUTE = 8
+ DP_COMPUTE_PROTO = 9
+ DP_COMPUTE_PROTO_WITH_FUNC = 10
+ DP_COMPUTE_METRIC = 11
+
+
+class Execute(Enum):
+ ALL = 0
+ RANK_ZERO = 1
+
+
+def _split_args_kwargs_data_proto(chunks, *args, **kwargs):
+ from verl.protocol import DataProto, DataProtoFuture
+ splitted_args = []
+ for arg in args:
+ assert isinstance(arg, (DataProto, DataProtoFuture))
+ splitted_args.append(arg.chunk(chunks=chunks))
+
+ splitted_kwargs = {}
+ for key, val in kwargs.items():
+ assert isinstance(val, (DataProto, DataProtoFuture))
+ splitted_kwargs[key] = val.chunk(chunks=chunks)
+
+ return splitted_args, splitted_kwargs
+
+
+def dispatch_one_to_all(worker_group, *args, **kwargs):
+ args = tuple([arg] * worker_group.world_size for arg in args)
+ kwargs = {k: [v] * worker_group.world_size for k, v in kwargs.items()}
+ return args, kwargs
+
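+# Hedged sketch: every arg/kwarg is replicated world_size times, e.g. with
+# world_size=4, dispatch_one_to_all(wg, x) returns (([x, x, x, x],), {}).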
+
+def dispatch_all_to_all(worker_group, *args, **kwargs):
+ return args, kwargs
+
+
+def collect_all_to_all(worker_group, output):
+ return output
+
+
+def dispatch_megatron_compute(worker_group, *args, **kwargs):
+ """
+ User passes in dp data. The data is dispatched to all tp/pp ranks with the same dp
+ """
+ from verl.single_controller.base.megatron.worker_group import MegatronWorkerGroup
+ assert isinstance(worker_group,
+ MegatronWorkerGroup), f'worker_group must be MegatronWorkerGroup, Got {type(worker_group)}'
+
+ all_args = []
+ for arg in args:
+ assert isinstance(arg, (Tuple, List)) and len(arg) == worker_group.dp_size
+ transformed_args = []
+ for i in range(worker_group.world_size):
+ local_dp_rank = worker_group.get_megatron_rank_info(rank=i).dp_rank
+ transformed_args.append(arg[local_dp_rank])
+ all_args.append(transformed_args)
+ all_args = tuple(all_args)
+
+ all_kwargs = {}
+ for k, v in kwargs.items():
+ assert isinstance(v, (Tuple, List)) and len(v) == worker_group.dp_size
+ transformed_v = []
+ for i in range(worker_group.world_size):
+ local_dp_rank = worker_group.get_megatron_rank_info(rank=i).dp_rank
+ transformed_v.append(v[local_dp_rank])
+ all_kwargs[k] = transformed_v
+ return all_args, all_kwargs
+
+
+def collect_megatron_compute(worker_group, output):
+ """
+ Only collect the data from the tp=0 and pp=last and every dp ranks
+ """
+ from verl.single_controller.base.megatron.worker_group import MegatronWorkerGroup
+ assert isinstance(worker_group, MegatronWorkerGroup)
+ output_in_dp = []
+ pp_size = worker_group.get_megatron_global_info().pp_size
+ for global_rank in range(worker_group.world_size):
+ local_rank_info = worker_group.get_megatron_rank_info(rank=global_rank)
+ if local_rank_info.tp_rank == 0 and local_rank_info.pp_rank == pp_size - 1:
+ output_in_dp.append(output[global_rank])
+ return output_in_dp
+
+
+def dispatch_megatron_compute_data_proto(worker_group, *args, **kwargs):
+ """
+ All the args and kwargs must be DataProto. The batch will be chunked by dp_size and passed to each rank
+ """
+ from verl.single_controller.base.megatron.worker_group import MegatronWorkerGroup
+ assert isinstance(worker_group, MegatronWorkerGroup)
+
+ splitted_args, splitted_kwargs = _split_args_kwargs_data_proto(worker_group.dp_size, *args, **kwargs)
+ return dispatch_megatron_compute(worker_group, *splitted_args, **splitted_kwargs)
+
+
+def _concat_data_proto_or_future(output: List):
+ from verl.protocol import DataProto, DataProtoFuture
+ import ray
+
+    # make sure all the elements in output have the same type
+ for o in output:
+ assert type(o) == type(output[0])
+
+ o = output[0]
+
+ if isinstance(o, DataProto):
+ return DataProto.concat(output)
+ elif isinstance(o, ray.ObjectRef):
+ return DataProtoFuture.concat(output)
+ else:
+ raise NotImplementedError
+
+
+def collect_megatron_compute_data_proto(worker_group, output):
+ """
+ Each output must be a DataProto. We concat the dim=0 of output
+ """
+ from verl.protocol import DataProto
+ import ray
+
+ output = collect_megatron_compute(worker_group, output)
+ for o in output:
+ assert isinstance(o, (DataProto, ray.ObjectRef)), f"expecting {o} to be DataProto, but got {type(o)}"
+
+ return _concat_data_proto_or_future(output)
+
+
+def dispatch_megatron_pp_as_dp(worker_group, *args, **kwargs):
+ """
+ treat pp as dp.
+ """
+ from verl.single_controller.base.megatron.worker_group import MegatronWorkerGroup
+ assert isinstance(worker_group, MegatronWorkerGroup)
+
+ pp_size = worker_group.pp_size
+ dp_size = worker_group.dp_size
+
+ pp_dp_size = pp_size * dp_size
+
+ all_args = []
+ for arg in args:
+ assert isinstance(arg, (List, Tuple)) and len(arg) == pp_dp_size
+ transformed_args = []
+ for i in range(worker_group.world_size):
+ local_dp_rank = worker_group.get_megatron_rank_info(rank=i).dp_rank
+ local_pp_rank = worker_group.get_megatron_rank_info(rank=i).pp_rank
+            # compute the rank in arg. Note that the order is dp then pp
+            # Also note that the outputs within a pp group are first all-gathered; then only the output of pp=0 is collected.
+            # For pp=2 dp=4, a batch of data "ABCDEFGH" should be dispatched and collected in the order below:
+            # dispatch:            pp_allgather:           collect:
+            #    dp 0 1 2 3           dp  0  1  2  3
+            # pp +---------+       pp +-------------+
+            #  0 | A C E G |        0 | AB CD EF GH |      ABCDEFGH
+            #  1 | B D F H |        1 | AB CD EF GH |
+            #    +---------+          +-------------+
+ arg_rank = local_dp_rank * worker_group.pp_size + local_pp_rank
+
+ transformed_args.append(arg[arg_rank])
+ all_args.append(transformed_args)
+ all_args = tuple(all_args)
+
+ all_kwargs = {}
+ for k, v in kwargs.items():
+ assert isinstance(v, (List, Tuple)) and len(v) == pp_dp_size, f'expect len(v)=={pp_dp_size}, got {len(v)}'
+ transformed_v = []
+ for i in range(worker_group.world_size):
+ local_dp_rank = worker_group.get_megatron_rank_info(rank=i).dp_rank
+ local_pp_rank = worker_group.get_megatron_rank_info(rank=i).pp_rank
+ # compute the rank in arg. Note that the order is dp then pp
+ arg_rank = local_dp_rank * worker_group.pp_size + local_pp_rank
+ transformed_v.append(v[arg_rank])
+ all_kwargs[k] = transformed_v
+ return all_args, all_kwargs
+
+
+def collect_megatron_pp_as_dp(worker_group, output):
+ """
+ treat pp as dp. Only collect data on tp=0
+ """
+ from verl.single_controller.base.megatron.worker_group import MegatronWorkerGroup
+ assert isinstance(worker_group, MegatronWorkerGroup)
+ output_in_dp = []
+ for global_rank in range(worker_group.world_size):
+ local_rank_info = worker_group.get_megatron_rank_info(rank=global_rank)
+ if local_rank_info.tp_rank == 0 and local_rank_info.pp_rank == 0:
+ output_in_dp.append(output[global_rank])
+ return output_in_dp
+
+
+def collect_megatron_pp_only(worker_group, output):
+ """
+ Only collect output of megatron pp. This is useful when examine weight names as they are identical in tp/dp
+ """
+ from verl.single_controller.base.megatron.worker_group import MegatronWorkerGroup
+ assert isinstance(worker_group, MegatronWorkerGroup)
+ output_in_pp = []
+ for global_rank in range(worker_group.world_size):
+ local_rank_info = worker_group.get_megatron_rank_info(rank=global_rank)
+ if local_rank_info.tp_rank == 0 and local_rank_info.dp_rank == 0:
+ output_in_pp.append(output[global_rank])
+ return output_in_pp
+
+
+def dispatch_megatron_pp_as_dp_data_proto(worker_group, *args, **kwargs):
+ from verl.single_controller.base.megatron.worker_group import MegatronWorkerGroup
+ assert isinstance(worker_group, MegatronWorkerGroup)
+
+ pp_dp_size = worker_group.dp_size * worker_group.pp_size
+ splitted_args, splitted_kwargs = _split_args_kwargs_data_proto(pp_dp_size, *args, **kwargs)
+ return dispatch_megatron_pp_as_dp(worker_group, *splitted_args, **splitted_kwargs)
+
+
+def collect_megatron_pp_as_dp_data_proto(worker_group, output):
+ from verl.protocol import DataProto
+ from verl.single_controller.base.megatron.worker_group import MegatronWorkerGroup
+ assert isinstance(worker_group, MegatronWorkerGroup)
+
+ output = collect_megatron_pp_as_dp(worker_group, output)
+ return _concat_data_proto_or_future(output)
+
+
+def dispatch_dp_compute(worker_group, *args, **kwargs):
+ from verl.single_controller.base.worker_group import WorkerGroup
+ assert isinstance(worker_group, WorkerGroup)
+ for arg in args:
+ assert isinstance(arg, (Tuple, List)) and len(arg) == worker_group.world_size
+ for k, v in kwargs.items():
+ assert isinstance(v, (Tuple, List)) and len(v) == worker_group.world_size
+ return args, kwargs
+
+
+def collect_dp_compute(worker_group, output):
+ from verl.single_controller.base.worker_group import WorkerGroup
+ assert isinstance(worker_group, WorkerGroup)
+ assert len(output) == worker_group.world_size
+ return output
+
+
+def dispatch_dp_compute_data_proto(worker_group, *args, **kwargs):
+ from verl.single_controller.base.worker_group import WorkerGroup
+ assert isinstance(worker_group, WorkerGroup)
+ splitted_args, splitted_kwargs = _split_args_kwargs_data_proto(worker_group.world_size, *args, **kwargs)
+ return splitted_args, splitted_kwargs
+
+
+def dispatch_dp_compute_data_proto_with_func(worker_group, *args, **kwargs):
+ from verl.single_controller.base.worker_group import WorkerGroup
+ assert isinstance(worker_group, WorkerGroup)
+    assert type(args[0]) == FunctionType  # NOTE: the first positional arg must be a function!
+
+ splitted_args, splitted_kwargs = _split_args_kwargs_data_proto(worker_group.world_size, *args[1:], **kwargs)
+ splitted_args_with_func = [[args[0]] * worker_group.world_size] + splitted_args
+ return splitted_args_with_func, splitted_kwargs
+
+
+def collect_dp_compute_data_proto(worker_group, output):
+ from verl.protocol import DataProto
+ import ray
+
+ for o in output:
+ assert isinstance(o, (DataProto, ray.ObjectRef)), f"expecting {o} to be DataProto, but got {type(o)}"
+
+ output = collect_dp_compute(worker_group, output)
+ return _concat_data_proto_or_future(output)
+
+
+def get_predefined_dispatch_fn(dispatch_mode):
+ predefined_dispatch_mode_fn = {
+ Dispatch.ONE_TO_ALL: {
+ 'dispatch_fn': dispatch_one_to_all,
+ 'collect_fn': collect_all_to_all,
+ },
+ Dispatch.ALL_TO_ALL: {
+ 'dispatch_fn': dispatch_all_to_all,
+ 'collect_fn': collect_all_to_all,
+ },
+ Dispatch.MEGATRON_COMPUTE: {
+ 'dispatch_fn': dispatch_megatron_compute,
+ 'collect_fn': collect_megatron_compute,
+ },
+ Dispatch.MEGATRON_PP_AS_DP: {
+ 'dispatch_fn': dispatch_megatron_pp_as_dp,
+ 'collect_fn': collect_megatron_pp_as_dp,
+ },
+ Dispatch.MEGATRON_PP_ONLY: {
+ 'dispatch_fn': dispatch_one_to_all,
+ 'collect_fn': collect_megatron_pp_only
+ },
+ Dispatch.MEGATRON_COMPUTE_PROTO: {
+ 'dispatch_fn': dispatch_megatron_compute_data_proto,
+ 'collect_fn': collect_megatron_compute_data_proto
+ },
+ Dispatch.MEGATRON_PP_AS_DP_PROTO: {
+ 'dispatch_fn': dispatch_megatron_pp_as_dp_data_proto,
+ 'collect_fn': collect_megatron_pp_as_dp_data_proto
+ },
+ Dispatch.DP_COMPUTE: {
+ 'dispatch_fn': dispatch_dp_compute,
+ 'collect_fn': collect_dp_compute
+ },
+ Dispatch.DP_COMPUTE_PROTO: {
+ 'dispatch_fn': dispatch_dp_compute_data_proto,
+ 'collect_fn': collect_dp_compute_data_proto
+ },
+ Dispatch.DP_COMPUTE_PROTO_WITH_FUNC: {
+ 'dispatch_fn': dispatch_dp_compute_data_proto_with_func,
+ 'collect_fn': collect_dp_compute_data_proto
+ },
+ Dispatch.DP_COMPUTE_METRIC: {
+ 'dispatch_fn': dispatch_dp_compute_data_proto,
+ 'collect_fn': collect_dp_compute
+ }
+ }
+ return predefined_dispatch_mode_fn[dispatch_mode]
+
+
+def get_predefined_execute_fn(execute_mode):
+ """
+ Note that here we only asks execute_all and execute_rank_zero to be implemented
+ Leave the choice of how these two functions handle argument 'blocking' to users
+ """
+ predefined_execute_mode_fn = {
+ Execute.ALL: {
+ 'execute_fn_name': 'execute_all'
+ },
+ Execute.RANK_ZERO: {
+ 'execute_fn_name': 'execute_rank_zero'
+ }
+ }
+ return predefined_execute_mode_fn[execute_mode]
+
+
+def _check_dispatch_mode(dispatch_mode):
+ assert isinstance(dispatch_mode,
+ (Dispatch, Dict)), f'dispatch_mode must be a Dispatch or a Dict. Got {dispatch_mode}'
+ if isinstance(dispatch_mode, Dict):
+ necessary_keys = ['dispatch_fn', 'collect_fn']
+ for key in necessary_keys:
+ assert key in dispatch_mode, f'key {key} should be in dispatch_mode if it is a dictionary'
+
+
+def _check_execute_mode(execute_mode):
+    assert isinstance(execute_mode, Execute), f'execute_mode must be an Execute. Got {execute_mode}'
+
+
+def _materialize_futures(*args, **kwargs):
+ new_args = []
+ for arg in args:
+ if isinstance(arg, DataProtoFuture):
+ arg = arg.get()
+ # add more type to materialize
+ new_args.append(arg)
+ for k, v in kwargs.items():
+ if isinstance(v, DataProtoFuture):
+ kwargs[k] = v.get()
+
+ new_args = tuple(new_args)
+ return new_args, kwargs
+
+
+def register(dispatch_mode=Dispatch.ALL_TO_ALL, execute_mode=Execute.ALL, blocking=True, materialize_futures=True):
+ _check_dispatch_mode(dispatch_mode=dispatch_mode)
+ _check_execute_mode(execute_mode=execute_mode)
+
+ def decorator(func):
+
+ @wraps(func)
+ def inner(*args, **kwargs):
+ if materialize_futures:
+ args, kwargs = _materialize_futures(*args, **kwargs)
+ return func(*args, **kwargs)
+
+ attrs = {'dispatch_mode': dispatch_mode, 'execute_mode': execute_mode, 'blocking': blocking}
+ setattr(inner, MAGIC_ATTR, attrs)
+ return inner
+
+ return decorator
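+
+
+# Hedged usage sketch (illustrative, not part of this file): decorating a Worker
+# method stores its dispatch/execute config under MAGIC_ATTR, which
+# WorkerGroup._bind_worker_method later reads to generate the driver-side method.
+#
+#   class MyWorker(Worker):
+#       @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
+#       def compute(self, data: DataProto) -> DataProto:
+#           ...  # runs on every rank; inputs are chunked by dp, outputs concatenated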
diff --git a/code/RL_model/verl/Search-R1/verl/single_controller/base/megatron/__init__.py b/code/RL_model/verl/Search-R1/verl/single_controller/base/megatron/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/single_controller/base/megatron/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/single_controller/base/megatron/worker.py b/code/RL_model/verl/Search-R1/verl/single_controller/base/megatron/worker.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d84d29f16420a5cf976d64f45ecbb599125c43c
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/single_controller/base/megatron/worker.py
@@ -0,0 +1,39 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from dataclasses import dataclass
+from verl.single_controller.base.worker import Worker, DistRankInfo, DistGlobalInfo
+
+
+class MegatronWorker(Worker):
+
+ def __init__(self, cuda_visible_devices=None) -> None:
+ super().__init__(cuda_visible_devices)
+
+ def get_megatron_global_info(self):
+ from megatron.core import parallel_state as mpu
+ tp_size = mpu.get_tensor_model_parallel_world_size()
+ dp_size = mpu.get_data_parallel_world_size()
+ pp_size = mpu.get_pipeline_model_parallel_world_size()
+ info = DistGlobalInfo(tp_size=tp_size, dp_size=dp_size, pp_size=pp_size)
+ return info
+
+ def get_megatron_rank_info(self):
+ from megatron.core import parallel_state as mpu
+ tp_rank = mpu.get_tensor_model_parallel_rank()
+ dp_rank = mpu.get_data_parallel_rank()
+ pp_rank = mpu.get_pipeline_model_parallel_rank()
+ info = DistRankInfo(tp_rank=tp_rank, dp_rank=dp_rank, pp_rank=pp_rank)
+ return info
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/single_controller/base/megatron/worker_group.py b/code/RL_model/verl/Search-R1/verl/single_controller/base/megatron/worker_group.py
new file mode 100644
index 0000000000000000000000000000000000000000..67c21d309b75f1fc7e76b87c9436efc103570f50
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/single_controller/base/megatron/worker_group.py
@@ -0,0 +1,51 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict
+
+from .worker import DistRankInfo, DistGlobalInfo
+from verl.single_controller.base import ResourcePool, WorkerGroup
+
+
+class MegatronWorkerGroup(WorkerGroup):
+
+ def __init__(self, resource_pool: ResourcePool, **kwargs):
+ super().__init__(resource_pool=resource_pool, **kwargs)
+ self._megatron_rank_info = None
+ self._megatron_global_info: DistGlobalInfo = None
+
+ def init_megatron(self, default_megatron_kwargs: Dict = None):
+        raise NotImplementedError("MegatronWorkerGroup.init_megatron should be overridden by a subclass")
+
+ def get_megatron_rank_info(self, rank: int) -> DistRankInfo:
+ assert 0 <= rank < self.world_size, f'rank must be from [0, world_size), Got {rank}'
+ return self._megatron_rank_info[rank]
+
+ @property
+ def tp_size(self):
+ assert self._megatron_global_info is not None, "MegatronWorkerGroup._megatron_global_info must be initialized"
+ return self._megatron_global_info.tp_size
+
+ @property
+ def dp_size(self):
+ assert self._megatron_global_info is not None, "MegatronWorkerGroup._megatron_global_info must be initialized"
+ return self._megatron_global_info.dp_size
+
+ @property
+ def pp_size(self):
+ assert self._megatron_global_info is not None, "MegatronWorkerGroup._megatron_global_info must be initialized"
+ return self._megatron_global_info.pp_size
+
+ def get_megatron_global_info(self):
+ return self._megatron_global_info
diff --git a/code/RL_model/verl/Search-R1/verl/single_controller/base/register_center/__init__.py b/code/RL_model/verl/Search-R1/verl/single_controller/base/register_center/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/single_controller/base/register_center/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/single_controller/base/register_center/ray.py b/code/RL_model/verl/Search-R1/verl/single_controller/base/register_center/ray.py
new file mode 100644
index 0000000000000000000000000000000000000000..430290cf2683d882d35a83256aa363d959265a05
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/single_controller/base/register_center/ray.py
@@ -0,0 +1,29 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ray
+
+
+@ray.remote
+class WorkerGroupRegisterCenter:
+
+ def __init__(self, rank_zero_info):
+ self.rank_zero_info = rank_zero_info
+
+ def get_rank_zero_info(self):
+ return self.rank_zero_info
+
+
+def create_worker_group_register_center(name, info):
+ return WorkerGroupRegisterCenter.options(name=name).remote(info)
diff --git a/code/RL_model/verl/Search-R1/verl/single_controller/base/worker.py b/code/RL_model/verl/Search-R1/verl/single_controller/base/worker.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad6bab9332b343cfcd3b8e4fdbe55010a995ab04
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/single_controller/base/worker.py
@@ -0,0 +1,186 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+the class for Worker
+"""
+import os
+import socket
+from dataclasses import dataclass
+from verl.single_controller.base.decorator import register, Dispatch, Execute
+
+
+@dataclass
+class DistRankInfo:
+ tp_rank: int
+ dp_rank: int
+ pp_rank: int
+
+
+@dataclass
+class DistGlobalInfo:
+ tp_size: int
+ dp_size: int
+ pp_size: int
+
+
+class WorkerHelper:
+
+ def _get_node_ip(self):
+
+ def get_node_ip_by_sdk():
+ if os.getenv("WG_BACKEND", None) == "ray":
+ import ray
+ return ray._private.services.get_node_ip_address()
+ elif os.getenv("WG_BACKEND", None) == "torch_rpc":
+ from verl.single_controller.torchrpc.k8s_client import get_ip_addr
+ return get_ip_addr()
+ return None
+
+ host_ipv4 = os.getenv("MY_HOST_IP", None)
+ host_ipv6 = os.getenv("MY_HOST_IPV6", None)
+ host_ip_by_env = host_ipv4 or host_ipv6
+ host_ip_by_sdk = get_node_ip_by_sdk()
+
+ host_ip = host_ip_by_env or host_ip_by_sdk
+ return host_ip
+
+ def _get_free_port(self):
+ with socket.socket() as sock:
+ sock.bind(('', 0))
+ return sock.getsockname()[1]
+
+ def get_availale_master_addr_port(self):
+ return self._get_node_ip(), str(self._get_free_port())
+
+ def _get_pid(self):
+ return
+
+
+class WorkerMeta:
+ keys = [
+ "WORLD_SIZE", "RANK", "LOCAL_WORLD_SIZE", "LOCAL_RANK", "MASTER_ADDR", "MASTER_PORT", "CUDA_VISIBLE_DEVICES"
+ ]
+
+ def __init__(self, store) -> None:
+ self._store = store
+
+ def to_dict(self):
+ return {f"_{key.lower()}": self._store.get(f"_{key.lower()}", None) for key in WorkerMeta.keys}
+
+
+# we assume that in each WorkerGroup, there is a Master Worker
+class Worker(WorkerHelper):
+
+ def __new__(cls, *args, **kwargs):
+ instance = super().__new__(cls)
+
+ # note that here we use int to distinguish
+ disable_worker_init = int(os.environ.get('DISABLE_WORKER_INIT', 0))
+ if disable_worker_init:
+ return instance
+
+ rank = os.environ.get("RANK", None)
+ worker_group_prefix = os.environ.get("WG_PREFIX", None)
+
+        # when the @ray.remote decorator is applied, __new__ is called, but we don't want to run _configure_before_init there
+ if None not in [rank, worker_group_prefix] and 'ActorClass(' not in cls.__name__:
+ instance._configure_before_init(f"{worker_group_prefix}_register_center", int(rank))
+
+ return instance
+
+ def _configure_before_init(self, register_center_name: str, rank: int):
+ assert isinstance(rank, int), f"rank must be int, instead of {type(rank)}"
+
+ if rank == 0:
+ master_addr, master_port = self.get_availale_master_addr_port()
+ rank_zero_info = {
+ "MASTER_ADDR": master_addr,
+ "MASTER_PORT": master_port,
+ }
+
+ if os.getenv("WG_BACKEND", None) == "ray":
+ from verl.single_controller.base.register_center.ray import create_worker_group_register_center
+ self.register_center = create_worker_group_register_center(name=register_center_name,
+ info=rank_zero_info)
+
+ os.environ.update(rank_zero_info)
+
+ def __init__(self, cuda_visible_devices=None) -> None:
+        # construct a meta from environment variables. Note that the import must be inside the class because it is executed remotely
+ import os
+ world_size = int(os.environ['WORLD_SIZE'])
+ rank = int(os.environ['RANK'])
+ self._rank = rank
+ self._world_size = world_size
+
+ master_addr = os.environ["MASTER_ADDR"]
+ master_port = os.environ["MASTER_PORT"]
+
+ local_world_size = int(os.getenv("LOCAL_WORLD_SIZE", "1"))
+ local_rank = int(os.getenv("LOCAL_RANK", "0"))
+
+ store = {
+ '_world_size': world_size,
+ '_rank': rank,
+ '_local_world_size': local_world_size,
+ '_local_rank': local_rank,
+ '_master_addr': master_addr,
+ '_master_port': master_port
+ }
+ if cuda_visible_devices is not None:
+ store['_cuda_visible_devices'] = cuda_visible_devices
+
+ meta = WorkerMeta(store=store)
+ self._configure_with_meta(meta=meta)
+
+ def _configure_with_meta(self, meta: WorkerMeta):
+ """
+ This function should only be called inside by WorkerGroup
+ """
+ assert isinstance(meta, WorkerMeta)
+ self.__dict__.update(meta.to_dict()) # this is hacky
+ # print(f"__dict__: {self.__dict__}")
+ for key in WorkerMeta.keys:
+ val = self.__dict__.get(f"_{key.lower()}", None)
+ if val is not None:
+ # print(f"set {key} to {val}")
+ os.environ[key] = str(val)
+ os.environ["REDIS_STORE_SERVER_HOST"] = str(self._master_addr).replace("[", "").replace(
+ "]", "") if self._master_addr else ""
+
+ def get_master_addr_port(self):
+ return self._master_addr, self._master_port
+
+ def get_cuda_visible_devices(self):
+ import os
+ cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "not set")
+ return cuda_visible_devices
+
+ @property
+ def world_size(self):
+ return self._world_size
+
+ @property
+ def rank(self):
+ return self._rank
+
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO_WITH_FUNC)
+ def execute_with_func_generator(self, func, *args, **kwargs):
+ ret_proto = func(self, *args, **kwargs)
+ return ret_proto
+
+ @register(dispatch_mode=Dispatch.ALL_TO_ALL, execute_mode=Execute.RANK_ZERO)
+ def execute_func_rank_zero(self, func, *args, **kwargs):
+ result = func(*args, **kwargs)
+ return result
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/single_controller/base/worker_group.py b/code/RL_model/verl/Search-R1/verl/single_controller/base/worker_group.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd584580c5c7223309e41ac39a865bd48c58c7d4
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/single_controller/base/worker_group.py
@@ -0,0 +1,196 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+the class of WorkerGroup
+"""
+import logging
+import threading
+import signal
+import time
+from typing import List, Any, Callable, Dict
+
+from verl.single_controller.base.decorator import MAGIC_ATTR, Dispatch, get_predefined_dispatch_fn, get_predefined_execute_fn
+
+
+class ResourcePool:
+
+ def __init__(self, process_on_nodes=None, max_collocate_count: int = 10, n_gpus_per_node=8) -> None:
+ if process_on_nodes is None:
+ process_on_nodes = []
+ self._store = process_on_nodes
+ self.max_collocate_count = max_collocate_count
+        self.n_gpus_per_node = n_gpus_per_node  # left for future Huawei hardware that packs 16 GPUs per node
+
+ def add_node(self, process_count):
+ self._store.append(process_count)
+
+ @property
+ def world_size(self):
+ return sum(self._store)
+
+ def __call__(self) -> Any:
+ return self._store
+
+ @property
+ def store(self):
+ return self._store
+
+ def local_world_size_list(self) -> List[int]:
+ nested_local_world_size_list = [
+ [local_world_size for _ in range(local_world_size)] for local_world_size in self._store
+ ]
+ return [item for row in nested_local_world_size_list for item in row]
+
+ def local_rank_list(self) -> List[int]:
+ nested_local_rank_list = [[i for i in range(local_world_size)] for local_world_size in self._store]
+ return [item for row in nested_local_rank_list for item in row]
+
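+    # Worked example (hedged): for process_on_nodes=[2, 3],
+    #   world_size              -> 5
+    #   local_world_size_list() -> [2, 2, 3, 3, 3]
+    #   local_rank_list()       -> [0, 1, 0, 1, 2]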
+
+class ClassWithInitArgs:
+ """
+ This class stores a class constructor and the args/kwargs to construct the class.
+ It is used to instantiate the remote class.
+ """
+
+ def __init__(self, cls, *args, **kwargs) -> None:
+ self.cls = cls
+ self.args = args
+ self.kwargs = kwargs
+
+ # def add_arg(self, arg):
+ # self.args += (arg,)
+
+ # def add_kwarg(self, key, value):
+ # self.kwargs[key] = value
+
+ def __call__(self) -> Any:
+ return self.cls(*self.args, **self.kwargs)
+
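+# Hedged usage sketch: defer construction until the target process is chosen.
+#   >>> cwia = ClassWithInitArgs(dict, a=1)
+#   >>> cwia()   # constructs the stored class with the stored args/kwargs
+#   {'a': 1}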
+
+def check_workers_alive(workers: List, is_alive: Callable, gap_time: float = 1) -> None:
+    while True:
+        for worker in workers:
+            if not is_alive(worker):
+                logging.warning(f"worker {worker} is not alive, sending signal to main thread")
+                signal.raise_signal(signal.SIGABRT)
+        time.sleep(gap_time)
+
+
+class WorkerGroup:
+
+ def __init__(self, resource_pool: ResourcePool, **kwargs) -> None:
+        self._is_init_with_detached_workers = resource_pool is None
+
+        if resource_pool is not None:
+            # handle the case when the WorkerGroup is attached to an existing resource pool
+            self._procecss_dispatch_config = resource_pool()
+        else:
+            self._procecss_dispatch_config = None
+
+ self._workers = []
+ self._worker_names = []
+
+ self._master_addr = None
+ self._master_port = None
+
+ self._checker_thread: threading.Thread = None
+
+ def _is_worker_alive(self, worker):
+        raise NotImplementedError("WorkerGroup._is_worker_alive called, should be implemented in derived class.")
+
+ def _block_until_all_workers_alive(self) -> None:
+ while True:
+ all_state = [self._is_worker_alive(worker) for worker in self._workers]
+            if not all(all_state):
+ time.sleep(1)
+ else:
+ break
+
+ def start_worker_aliveness_check(self, every_n_seconds=1) -> None:
+ # before starting checking worker aliveness, make sure all workers are already alive
+ self._block_until_all_workers_alive()
+
+ self._checker_thread = threading.Thread(target=check_workers_alive,
+ args=(self._workers, self._is_worker_alive, every_n_seconds))
+ self._checker_thread.start()
+
+ @property
+ def world_size(self):
+ return len(self._workers)
+
+    # execute_all_async and execute_rank_zero_async should be implemented by RayWorkerGroup and TorchRPCWorkerGroup;
+    # MegatronWorkerGroup and XperfWorkerGroup should skip them
+
+ def _bind_worker_method(self, user_defined_cls, func_generator):
+ """
+ Bind the worker method to the WorkerGroup
+ """
+
+ for method_name in dir(user_defined_cls):
+
+            try:
+                method = getattr(user_defined_cls, method_name)
+                assert callable(method), f"{method_name} in {user_defined_cls} is not callable"
+            except Exception:
+                # properties fail here because the class itself has no instance attribute
+                continue
+
+ if hasattr(method, MAGIC_ATTR):
+ # this method is decorated by register
+ attribute = getattr(method, MAGIC_ATTR)
+ assert isinstance(attribute, Dict), f'attribute must be a dictionary. Got {type(attribute)}'
+                assert 'dispatch_mode' in attribute, 'attribute must contain dispatch_mode in its keys'
+
+ dispatch_mode = attribute['dispatch_mode']
+ execute_mode = attribute['execute_mode']
+ blocking = attribute['blocking']
+
+ # get dispatch fn
+ if isinstance(dispatch_mode, Dispatch):
+ # get default dispatch fn
+ fn = get_predefined_dispatch_fn(dispatch_mode=dispatch_mode)
+ dispatch_fn = fn['dispatch_fn']
+ collect_fn = fn['collect_fn']
+ else:
+ assert isinstance(dispatch_mode, dict)
+ assert 'dispatch_fn' in dispatch_mode
+ assert 'collect_fn' in dispatch_mode
+ dispatch_fn = dispatch_mode['dispatch_fn']
+ collect_fn = dispatch_mode['collect_fn']
+
+ # get execute_fn_name
+ execute_mode = get_predefined_execute_fn(execute_mode=execute_mode)
+ wg_execute_fn_name = execute_mode['execute_fn_name']
+
+ # get execute_fn from string
+ try:
+ execute_fn = getattr(self, wg_execute_fn_name)
+ assert callable(execute_fn), 'execute_fn must be callable'
+            except Exception:
+                print(f'execute_fn {wg_execute_fn_name} is invalid')
+ raise
+
+ # bind a new method to the RayWorkerGroup
+ func = func_generator(self,
+ method_name,
+ dispatch_fn=dispatch_fn,
+ collect_fn=collect_fn,
+ execute_fn=execute_fn,
+ blocking=blocking)
+
+            try:
+                setattr(self, method_name, func)
+            except Exception as e:
+                raise ValueError(f'Failed to set method_name {method_name}') from e
diff --git a/code/RL_model/verl/Search-R1/verl/single_controller/ray/__init__.py b/code/RL_model/verl/Search-R1/verl/single_controller/ray/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d5783448e68e7207e45303aaec3894e8ea838d1
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/single_controller/ray/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base import RayResourcePool, RayClassWithInitArgs, RayWorkerGroup, create_colocated_worker_cls
+from .megatron import (MegatronRayWorkerGroup, DistRankInfo, DistGlobalInfo)
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/single_controller/ray/base.py b/code/RL_model/verl/Search-R1/verl/single_controller/ray/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..eaa1b00de398a08223e0b7bcb25be943bf614f5b
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/single_controller/ray/base.py
@@ -0,0 +1,459 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+from typing import Dict, List, Any, Tuple
+
+import ray
+from ray.util import list_named_actors
+from ray.util.placement_group import placement_group, PlacementGroup
+from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy, NodeAffinitySchedulingStrategy
+from ray.experimental.state.api import get_actor
+
+from verl.single_controller.base import WorkerGroup, ResourcePool, ClassWithInitArgs, Worker
+
+__all__ = ['Worker']
+
+
+def get_random_string(length: int) -> str:
+ import random
+ import string
+ letters_digits = string.ascii_letters + string.digits
+ return ''.join(random.choice(letters_digits) for _ in range(length))
+
+
+def func_generator(self, method_name, dispatch_fn, collect_fn, execute_fn, blocking):
+
+ def func(*args, **kwargs):
+ args, kwargs = dispatch_fn(self, *args, **kwargs)
+ output = execute_fn(method_name, *args, **kwargs)
+ if blocking:
+ output = ray.get(output)
+ output = collect_fn(self, output)
+ return output
+
+ return func
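+# Call-path sketch (hedged; `some_method` and `batch` are illustrative): a bound method
+# generated here behaves as
+#
+#     out = wg.some_method(batch)
+#
+# running dispatch_fn to shard the inputs, execute_fn to fan them out to workers,
+# ray.get when blocking=True, and collect_fn to merge the per-worker outputs.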
+
+
+class RayResourcePool(ResourcePool):
+
+ def __init__(self,
+ process_on_nodes: List[int] = None,
+ use_gpu: bool = True,
+ name_prefix: str = "",
+ max_colocate_count: int = 5,
+ detached=False) -> None:
+ super().__init__(process_on_nodes, max_colocate_count)
+ self.use_gpu = use_gpu
+ # print(f"in RayProcessDispatchConfiguration: name_prefix = {name_prefix}")
+ self.name_prefix = name_prefix
+ self.pgs = None
+ self.detached = detached
+
+ def get_placement_groups(self, strategy="STRICT_PACK", name=None):
+ if self.pgs is not None:
+ return self.pgs
+
+ pg_name_prefix = name if name else \
+ f"{self.name_prefix}verl_group_{'_'.join([str(count) for count in self._store])}:"
+ # print(f"pg_name_prefix = {pg_name_prefix}")
+ pg_scheme = [[{
+ "CPU": self.max_collocate_count,
+ "GPU": 1
+ } if self.use_gpu else {
+ "CPU": self.max_collocate_count
+ } for _ in range(process_count)] for process_count in self._store]
+
+ lifetime = 'detached' if self.detached else None
+
+ pgs = [
+ placement_group(bundles=bundles, strategy=strategy, name=pg_name_prefix + str(idx), lifetime=lifetime)
+ for idx, bundles in enumerate(pg_scheme)
+ ]
+
+ ray.get([pg.ready() for pg in pgs])
+
+ self.pgs = pgs
+ return pgs
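+    # Usage sketch (hedged): two nodes running 4 GPU processes each, with up to
+    # max_colocate_count workers sharing one GPU bundle:
+    #
+    #     pool = RayResourcePool(process_on_nodes=[4, 4], use_gpu=True)
+    #     pgs = pool.get_placement_groups(strategy="STRICT_PACK")  # one placement group per node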
+
+
+def extract_pg_from_exist(resource_pools: Dict[str, RayResourcePool], src_role_names: List[str],
+ resource_pool: RayResourcePool) -> List:
+
+ src_pgs = [
+ pg for role_name, resource_pool in resource_pools.items() for pg in resource_pool.get_placement_groups()
+ if role_name in src_role_names
+ ]
+
+ sorted_src_pgs = sorted(src_pgs, key=lambda pg: pg.bundle_count, reverse=True)
+ sorted_process_on_nodes = sorted([(val, idx) for idx, val in enumerate(resource_pool.store)], reverse=True)
+
+ unsorted_pgs: List[Tuple[int, PlacementGroup]] = []
+ searching_idx = 0
+ for request_process, original_idx in sorted_process_on_nodes:
+        assert searching_idx < len(sorted_src_pgs), f"not enough nodes for request: searching the {searching_idx}-th node"
+ assert request_process <= sorted_src_pgs[searching_idx].bundle_count, \
+ f"requesting {request_process} processes, bundle count cannot satisfy"
+ unsorted_pgs.append((original_idx, sorted_src_pgs[searching_idx]))
+ searching_idx += 1
+
+ return [pg for _, pg in sorted(unsorted_pgs)]
+
+
+def merge_resource_pool(rp1: RayResourcePool, rp2: RayResourcePool) -> RayResourcePool:
+    assert rp1.use_gpu == rp2.use_gpu, 'Both RayResourcePools must agree on use_gpu'
+    assert rp1.max_collocate_count == rp2.max_collocate_count, 'Both RayResourcePools must have the same max_collocate_count'
+    assert rp1.n_gpus_per_node == rp2.n_gpus_per_node, 'Both RayResourcePools must have the same n_gpus_per_node'
+    assert rp1.detached == rp2.detached, 'Detached ResourcePool cannot be merged with non-detached ResourcePool'
+
+ new_store = rp1.store + rp2.store
+
+ merged = RayResourcePool(new_store, rp1.use_gpu, f"{rp1.name_prefix}_{rp2.name_prefix}")
+ merged.pgs = rp1.get_placement_groups() + rp2.get_placement_groups()
+
+ return merged
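+# Usage sketch (hedged; actor_pool/critic_pool are illustrative pools built as above):
+#
+#     merged = merge_resource_pool(actor_pool, critic_pool)
+#     # merged reuses both pools' existing placement groups instead of allocating new ones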
+
+
+class RayClassWithInitArgs(ClassWithInitArgs):
+
+ def __init__(self, cls, *args, **kwargs) -> None:
+ # self._options = kwargs.pop('options', dict())
+ super().__init__(cls, *args, **kwargs)
+ self._options = {}
+ self._additional_resource = {}
+
+ def set_additional_resource(self, additional_resource):
+ self._additional_resource = additional_resource
+
+ def update_options(self, options: Dict):
+ self._options.update(options)
+
+ def __call__(self,
+ placement_group,
+ placement_group_bundle_idx,
+ use_gpu: bool = True,
+ num_gpus=1,
+ sharing_with=None) -> Any:
+ if sharing_with is not None:
+ target_node_id = ray.get(sharing_with.get_node_id.remote())
+ cuda_visible_devices = ray.get(sharing_with.get_cuda_visible_devices.remote())
+ options = {"scheduling_strategy": NodeAffinitySchedulingStrategy(node_id=target_node_id, soft=False)}
+ return self.cls.options(**options).remote(*self.args,
+ cuda_visible_devices=cuda_visible_devices,
+ **self.kwargs)
+
+ options = {
+ "scheduling_strategy":
+ PlacementGroupSchedulingStrategy(placement_group=placement_group,
+ placement_group_bundle_index=placement_group_bundle_idx)
+ }
+ options.update(self._options)
+
+ if use_gpu:
+ options["num_gpus"] = num_gpus
+
+        if len(self._additional_resource) >= 1:
+            for k, v in self._additional_resource.items():
+                options[k] = v
+
+ # print("cls:", self.cls)
+ # print("args: ", self.args)
+ # print("kwargs: ", self.kwargs)
+ return self.cls.options(**options).remote(*self.args, **self.kwargs)
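+    # Usage sketch (hedged; MyWorker is an illustrative Worker subclass):
+    #
+    #     cls_with_init = RayClassWithInitArgs(ray.remote(MyWorker), config=cfg)
+    #     actor = cls_with_init(placement_group=pg, placement_group_bundle_idx=0,
+    #                           use_gpu=True, num_gpus=0.2)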
+
+
+class RayWorkerGroup(WorkerGroup):
+
+ def __init__(self,
+ resource_pool: RayResourcePool = None,
+ ray_cls_with_init: RayClassWithInitArgs = None,
+ bin_pack: bool = True,
+ name_prefix: str = None,
+ detached=False,
+ worker_names=None,
+ **kwargs) -> None:
+ super().__init__(resource_pool=resource_pool, **kwargs)
+ self.ray_cls_with_init = ray_cls_with_init
+ self.name_prefix = get_random_string(length=6) if name_prefix is None else name_prefix
+
+ if worker_names is not None:
+ assert self._is_init_with_detached_workers
+ self._worker_names = worker_names
+
+ if self._is_init_with_detached_workers:
+ self._init_with_detached_workers(worker_names=worker_names)
+ else:
+ self._init_with_resource_pool(resource_pool=resource_pool,
+ ray_cls_with_init=ray_cls_with_init,
+ bin_pack=bin_pack,
+ detached=detached)
+
+ if ray_cls_with_init is not None:
+ self._bind_worker_method(self.ray_cls_with_init.cls, func_generator)
+
+ def _is_worker_alive(self, worker: ray.actor.ActorHandle):
+ worker_state_dict = get_actor(worker._actor_id.hex())
+ return worker_state_dict.get("state", "undefined") == "ALIVE" if worker_state_dict is not None else False
+
+ def _init_with_detached_workers(self, worker_names):
+ workers = [ray.get_actor(name=name) for name in worker_names]
+ self._workers = workers
+ self._world_size = len(worker_names)
+
+ def _init_with_resource_pool(self, resource_pool, ray_cls_with_init, bin_pack, detached):
+ use_gpu = resource_pool.use_gpu
+
+ strategy = "PACK"
+ if bin_pack:
+ strategy = "STRICT_PACK"
+ pgs = resource_pool.get_placement_groups(strategy=strategy)
+ world_size = resource_pool.world_size
+ self._world_size = world_size
+ # cia.add_kwarg("_world_size", world_size)
+ num_gpus = 1 / resource_pool.max_collocate_count
+
+ rank = -1
+ for pg_idx, local_world_size in enumerate(resource_pool.store):
+ pg = pgs[pg_idx]
+            assert local_world_size <= pg.bundle_count, \
+                f"when generating workers for {self.name_prefix}, local_world_size " \
+                f"({local_world_size}) exceeds the bundle count ({pg.bundle_count}) of placement group {pg_idx}"
+ for local_rank in range(local_world_size):
+ rank += 1
+
+ # we pass in environment variable at option so that Worker can use environment variable to set
+ env_vars = {
+ 'WORLD_SIZE': str(world_size),
+ 'RANK': str(rank),
+ 'WG_PREFIX': self.name_prefix,
+ 'WG_BACKEND': 'ray',
+ 'RAY_LOCAL_WORLD_SIZE': str(local_world_size),
+ 'RAY_LOCAL_RANK': str(local_rank),
+ }
+ if rank != 0:
+ env_vars['MASTER_ADDR'] = self._master_addr
+ env_vars['MASTER_PORT'] = self._master_port
+
+ import re
+ cia_name = type(ray_cls_with_init.cls).__name__
+ match = re.search(r"ActorClass\(([^)]+)\)", cia_name) # ray.remote(Obj) -> "ActorClass(Obj)"
+ cia_name = match.group(1) if match else cia_name # "ActorClass(Obj)" -> "Obj"
+ name = f"{self.name_prefix}{cia_name}_{pg_idx}:{local_rank}" # e.g. Worker_2:5
+
+ ray_cls_with_init.update_options({'runtime_env': {'env_vars': env_vars}, 'name': name})
+
+ if detached:
+ ray_cls_with_init.update_options({'lifetime': 'detached'})
+
+ # create a worker
+ worker = ray_cls_with_init(placement_group=pg,
+ placement_group_bundle_idx=local_rank,
+ use_gpu=use_gpu,
+ num_gpus=num_gpus)
+ self._workers.append(worker)
+ self._worker_names.append(name)
+
+ if rank == 0:
+ register_center_actor = None
+ for _ in range(120):
+ if f"{self.name_prefix}_register_center" not in list_named_actors():
+ time.sleep(1)
+ else:
+ register_center_actor = ray.get_actor(f"{self.name_prefix}_register_center")
+ break
+ assert register_center_actor is not None, f"failed to get register_center_actor: {self.name_prefix}_register_center in {list_named_actors(all_namespaces=True)}"
+ rank_zero_info = ray.get(register_center_actor.get_rank_zero_info.remote())
+ self._master_addr, self._master_port = rank_zero_info['MASTER_ADDR'], rank_zero_info['MASTER_PORT']
+ # print(f"rank_zero_info: {rank_zero_info}")
+ # print(f"master_addr: {self._master_addr}, master_port: {self._master_port}")
+
+ @property
+ def worker_names(self):
+ return self._worker_names
+
+ @classmethod
+ def from_detached(cls, worker_names=None, ray_cls_with_init=None):
+ worker_group = cls(resource_pool=None,
+ ray_cls_with_init=ray_cls_with_init,
+ name_prefix=None,
+ worker_names=worker_names)
+ return worker_group
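+    # Usage sketch (hedged): re-attach to detached workers created by an earlier driver:
+    #
+    #     wg = RayWorkerGroup.from_detached(worker_names=names,
+    #                                       ray_cls_with_init=cls_with_init)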
+
+ def spawn(self, prefix_set):
+ """
+ spawn to a dictionary of worker groups, each with a subset of method with prefix.
+        Spawn a dictionary of worker groups, each exposing the subset of methods
+        that share a given prefix.
+
+ def _rebind_actor_methods(worker_group, actor_name):
+ """
+            bind each method carrying the actor-name prefix back to its original name
+ """
+ prefix: str = actor_name + '_'
+ for method_name in dir(worker_group):
+ if method_name.startswith(prefix):
+ # only valid when Python >= 3.9
+ original_method_name = method_name.removeprefix(prefix)
+ method = getattr(worker_group, method_name)
+ setattr(worker_group, original_method_name, method)
+
+ new_worker_group_dict = {}
+ for prefix in prefix_set:
+ new_worker_group = self.from_detached(worker_names=self._worker_names,
+ ray_cls_with_init=self.ray_cls_with_init)
+
+ _rebind_actor_methods(new_worker_group, prefix)
+ new_worker_group_dict[prefix] = new_worker_group
+ return new_worker_group_dict
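+    # Usage sketch (hedged; 'actor'/'critic' are illustrative prefixes): for a colocated
+    # group whose methods were bound as actor_*/critic_*,
+    #
+    #     groups = wg.spawn(prefix_set={'actor', 'critic'})
+    #     groups['actor'].update_policy(batch)  # resolves to the actor_update_policy binding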
+
+    def execute_rank_zero_sync(self, method_name: str, *args, **kwargs):
+        return ray.get(self.execute_rank_zero_async(method_name, *args, **kwargs))
+
+ def execute_rank_zero_async(self, method_name: str, *args, **kwargs):
+ remote_call = getattr(self._workers[0], method_name)
+ return remote_call.remote(*args, **kwargs)
+
+ def execute_rank_zero(self, method_name: str, *args, **kwargs):
+ return self.execute_rank_zero_async(method_name, *args, **kwargs)
+
+ def execute_all(self, method_name: str, *args, **kwargs):
+ return self.execute_all_async(method_name, *args, **kwargs)
+
+ def execute_all_sync(self, method_name: str, *args, **kwargs):
+ return ray.get(self.execute_all_async(method_name, *args, **kwargs))
+
+ def execute_all_async(self, method_name: str, *args, **kwargs):
+        # Here we assume that if every positional and keyword argument is a list whose
+        # length equals len(self._workers), each element is dispatched to the
+        # corresponding worker; otherwise the same arguments are broadcast to all workers.
+ # print(f"execute_all_async: method {method_name}({args}, {kwargs})")
+ length = len(self._workers)
+ if all(isinstance(arg, list) for arg in args) and all(isinstance(kwarg, list) for kwarg in kwargs.values()):
+ if all(len(arg) == length for arg in args) and all(len(kwarg) == length for kwarg in kwargs.values()):
+ # print(f"splitting args and kwargs into {length} shards")
+ result = []
+ for i in range(length):
+ sliced_args = tuple(arg[i] for arg in args)
+ sliced_kwargs = {k: v[i] for k, v in kwargs.items()}
+ remote_call = getattr(self._workers[i], method_name)
+ result.append(remote_call.remote(*sliced_args, **sliced_kwargs))
+ return result
+
+ return [getattr(worker, method_name).remote(*args, **kwargs) for worker in self._workers]
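+    # Dispatch sketch (hedged): with 4 workers, list arguments of matching length are
+    # sharded element-wise; anything else is broadcast:
+    #
+    #     wg.execute_all_async('process', [d0, d1, d2, d3])  # worker i receives d_i
+    #     wg.execute_all_async('process', shared_batch)      # every worker receives shared_batch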
+
+ @property
+ def master_address(self):
+ return self._master_addr
+
+ @property
+ def master_port(self):
+ return self._master_port
+
+ @property
+ def workers(self):
+ return self._workers
+
+ @property
+ def world_size(self):
+ return self._world_size
+
+
+"""
+Utilities that enable creating workers inside the same ray.Actor,
+with code written in separate ray.Actors.
+"""
+
+from unittest.mock import patch
+from verl.single_controller.base.decorator import MAGIC_ATTR
+import os
+
+
+def _bind_workers_method_to_parent(cls, key, user_defined_cls):
+ """
+ Binds the methods of each worker to the WorkerDict.
+ Note that we only bind public methods that are decorated by register
+ """
+ for method_name in dir(user_defined_cls):
+ try:
+ method = getattr(user_defined_cls, method_name)
+ assert callable(method), f"{method_name} in {user_defined_cls} is not callable"
+        except Exception:
+ # if it is a property, it will fail because Class doesn't have instance property
+ continue
+
+ if hasattr(method, MAGIC_ATTR):
+
+ def generate_function(name):
+
+ def func(self, *args, **kwargs):
+ # dispatch to the actual worker
+ return getattr(self.worker_dict[key], name)(*args, **kwargs)
+
+ return func
+
+ func = generate_function(method_name)
+ # pass MAGIC_ATTR for outer worker group
+ setattr(func, MAGIC_ATTR, getattr(method, MAGIC_ATTR))
+ try:
+ method_name_with_prefix = key + '_' + method_name
+ setattr(cls, method_name_with_prefix, func)
+ # print(f'Binding {method_name_with_prefix}')
+            except Exception as e:
+                raise ValueError(f'Failed to set method_name {method_name}') from e
+
+
+def _unwrap_ray_remote(cls):
+ if hasattr(cls, '__ray_actor_class__'):
+ cls = cls.__ray_actor_class__
+ return cls
+
+
+def create_colocated_worker_cls(class_dict: dict[str, RayClassWithInitArgs]):
+ """
+    Build a WorkerDict class whose instances delegate calls to every cls in
+    class_dict, and return it wrapped in a RayClassWithInitArgs.
+ """
+ cls_dict = {}
+ init_args_dict = {}
+ worker_cls = None
+ for key, cls in class_dict.items():
+        if worker_cls is None:
+            worker_cls = cls.cls.__ray_actor_class__.__base__
+        else:
+            assert worker_cls == cls.cls.__ray_actor_class__.__base__, \
+                'the worker class should be the same when sharing the same process'
+ cls_dict[key] = cls.cls
+ init_args_dict[key] = {'args': cls.args, 'kwargs': cls.kwargs}
+
+ assert cls_dict.keys() == init_args_dict.keys()
+
+ # TODO: create a class with customizable name
+ class WorkerDict(worker_cls):
+
+ def __init__(self):
+ super().__init__()
+ self.worker_dict = {}
+ for key, user_defined_cls in cls_dict.items():
+ user_defined_cls = _unwrap_ray_remote(user_defined_cls)
+ # directly instantiate the class without remote
+ with patch.dict(os.environ, {'DISABLE_WORKER_INIT': '1'}):
+ self.worker_dict[key] = user_defined_cls(*init_args_dict[key].get('args', ()),
+ **init_args_dict[key].get('kwargs', {}))
+
+ # now monkey-patch the methods from inner class to WorkerDict
+ for key, user_defined_cls in cls_dict.items():
+ user_defined_cls = _unwrap_ray_remote(user_defined_cls)
+ _bind_workers_method_to_parent(WorkerDict, key, user_defined_cls)
+
+ remote_cls = ray.remote(WorkerDict)
+ remote_cls = RayClassWithInitArgs(cls=remote_cls)
+ return remote_cls
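+# Usage sketch (hedged; Actor/Critic are illustrative worker classes sharing the same
+# Worker base):
+#
+#     colocated_cls = create_colocated_worker_cls({
+#         'actor': RayClassWithInitArgs(ray.remote(Actor), actor_cfg),
+#         'critic': RayClassWithInitArgs(ray.remote(Critic), critic_cfg),
+#     })
+#     wg = RayWorkerGroup(resource_pool=pool, ray_cls_with_init=colocated_cls)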
diff --git a/code/RL_model/verl/Search-R1/verl/single_controller/ray/megatron.py b/code/RL_model/verl/Search-R1/verl/single_controller/ray/megatron.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cdb49f95a77dca20c6a8f67ee1b61cfd4a1e8fc
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/single_controller/ray/megatron.py
@@ -0,0 +1,62 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, Optional
+
+import ray
+
+from .base import RayWorkerGroup, RayResourcePool, RayClassWithInitArgs
+from verl.single_controller.base.megatron.worker import DistRankInfo, DistGlobalInfo
+from verl.single_controller.base.megatron.worker_group import MegatronWorkerGroup
+
+
+# NOTE(sgm): for opensource megatron-core
+class NVMegatronRayWorkerGroup(RayWorkerGroup, MegatronWorkerGroup):
+ """
+ MegatronWorkerGroup will query each worker of its megatron rank info and store it inside the WorkerGroup
+ so that the dispatcher can use it to dispatch data.
+ """
+
+ def __init__(self, resource_pool: RayResourcePool, ray_cls_with_init: RayClassWithInitArgs, **kwargs):
+ super().__init__(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init, **kwargs)
+ self._megatron_rank_info: DistRankInfo = self.execute_all_sync(method_name='get_megatron_rank_info')
+ self._megatron_global_info: DistGlobalInfo = ray.get(
+ self.execute_rank_zero_async(method_name='get_megatron_global_info'))
+
+
+class MegatronRayWorkerGroup(RayWorkerGroup, MegatronWorkerGroup):
+ """
+ MegatronWorkerGroup will query each worker of its megatron rank info and store it inside the WorkerGroup
+ so that the dispatcher can use it to dispatch data.
+ """
+
+ def __init__(self,
+ resource_pool: RayResourcePool,
+ ray_cls_with_init: RayClassWithInitArgs,
+ default_megatron_kwargs: Dict = None,
+ **kwargs):
+ super().__init__(resource_pool=resource_pool,
+ ray_cls_with_init=ray_cls_with_init,
+ default_megatron_kwargs=default_megatron_kwargs,
+ **kwargs)
+ self.init_megatron(default_megatron_kwargs=default_megatron_kwargs)
+ self._megatron_rank_info: DistRankInfo = self.execute_all_sync(method_name='get_megatron_rank_info')
+ self._megatron_global_info: DistGlobalInfo = ray.get(
+ self.execute_rank_zero_async(method_name='get_megatron_global_info'))
+
+ def init_megatron(self, default_megatron_kwargs: Optional[Dict] = None):
+ # after super, we will call init of each worker
+ if not self._is_init_with_detached_workers:
+ # only init_megatron if the WorkerGroup is created from scratch
+ self.execute_all_sync(method_name='init_megatron', default_megatron_kwargs=default_megatron_kwargs)
diff --git a/code/RL_model/verl/Search-R1/verl/single_controller/version/version b/code/RL_model/verl/Search-R1/verl/single_controller/version/version
new file mode 100644
index 0000000000000000000000000000000000000000..7bcd0e3612da7c517106f9b581a8beb53d4b0a97
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/single_controller/version/version
@@ -0,0 +1 @@
+0.0.2
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/__init__.py b/code/RL_model/verl/Search-R1/verl/third_party/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/__init__.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..290c83781e45d91cfae4643ea72166be65879bf4
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/__init__.py
@@ -0,0 +1,51 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from importlib.metadata import version, PackageNotFoundError
+
+
+def get_version(pkg):
+ try:
+ return version(pkg)
+ except PackageNotFoundError:
+ return None
+
+
+package_name = 'vllm'
+package_version = get_version(package_name)
+
+if package_version == '0.3.1':
+ vllm_version = '0.3.1'
+ from .vllm_v_0_3_1.llm import LLM
+ from .vllm_v_0_3_1.llm import LLMEngine
+ from .vllm_v_0_3_1 import parallel_state
+elif package_version == '0.4.2':
+ vllm_version = '0.4.2'
+ from .vllm_v_0_4_2.llm import LLM
+ from .vllm_v_0_4_2.llm import LLMEngine
+ from .vllm_v_0_4_2 import parallel_state
+elif package_version == '0.5.4':
+ vllm_version = '0.5.4'
+ from .vllm_v_0_5_4.llm import LLM
+ from .vllm_v_0_5_4.llm import LLMEngine
+ from .vllm_v_0_5_4 import parallel_state
+elif package_version == '0.6.3':
+ vllm_version = '0.6.3'
+ from .vllm_v_0_6_3.llm import LLM
+ from .vllm_v_0_6_3.llm import LLMEngine
+ from .vllm_v_0_6_3 import parallel_state
+else:
+ raise ValueError(
+ f'vllm version {package_version} not supported. Currently supported versions are 0.3.1, 0.4.2, 0.5.4 and 0.6.3.'
+ )
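+# Sanity-check sketch (hedged): downstream code can branch on the resolved version:
+#
+#     from verl.third_party.vllm import vllm_version, LLM
+#     assert vllm_version in ('0.3.1', '0.4.2', '0.5.4', '0.6.3')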
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/__init__.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/arg_utils.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/arg_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ae8f3b8f62fb62a909f4dfe66ede389b64e61b9
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/arg_utils.py
@@ -0,0 +1,228 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py
+import argparse
+import dataclasses
+from dataclasses import dataclass
+from typing import Dict, Optional, Tuple
+
+import torch.nn as nn
+from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig)
+from transformers import PretrainedConfig
+from .config import ModelConfig
+
+
+@dataclass
+class EngineArgs:
+ """Arguments for vLLM engine."""
+ model_hf_config: PretrainedConfig = None
+ dtype: str = 'auto'
+ kv_cache_dtype: str = 'auto'
+ seed: int = 0
+ max_model_len: Optional[int] = None
+ worker_use_ray: bool = False
+ pipeline_parallel_size: int = 1
+ tensor_parallel_size: int = 1
+ max_parallel_loading_workers: Optional[int] = None
+ block_size: int = 16
+ swap_space: int = 4 # GiB
+ gpu_memory_utilization: float = 0.90
+ max_num_batched_tokens: Optional[int] = None
+ max_num_seqs: int = 256
+ max_paddings: int = 256
+ disable_log_stats: bool = False
+ revision: Optional[str] = None
+ tokenizer_revision: Optional[str] = None
+ quantization: Optional[str] = None
+ load_format: str = 'model'
+ enforce_eager: bool = False
+ max_context_len_to_capture: int = 8192
+ disable_custom_all_reduce: bool = False
+ enable_lora: bool = False
+ max_loras: int = 1
+ max_lora_rank: int = 16
+ lora_extra_vocab_size: int = 256
+    lora_dtype: str = 'auto'
+ max_cpu_loras: Optional[int] = None
+ device: str = 'cuda'
+
+ @staticmethod
+ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+ """Shared CLI arguments for vLLM engine."""
+ # Model arguments
+ # TODO(shengguangming): delete the unused args
+ parser.add_argument('--model',
+ type=str,
+ default='facebook/opt-125m',
+ help='name or path of the huggingface model to use')
+ parser.add_argument('--tokenizer',
+ type=str,
+                            default=None,  # this trimmed EngineArgs defines no tokenizer field
+ help='name or path of the huggingface tokenizer to use')
+ parser.add_argument('--revision',
+ type=str,
+ default=None,
+ help='the specific model version to use. It can be a branch '
+ 'name, a tag name, or a commit id. If unspecified, will use '
+ 'the default version.')
+ parser.add_argument('--tokenizer-revision',
+ type=str,
+ default=None,
+ help='the specific tokenizer version to use. It can be a branch '
+ 'name, a tag name, or a commit id. If unspecified, will use '
+ 'the default version.')
+ parser.add_argument('--tokenizer-mode',
+ type=str,
+                            default='auto',
+ choices=['auto', 'slow'],
+ help='tokenizer mode. "auto" will use the fast '
+ 'tokenizer if available, and "slow" will '
+ 'always use the slow tokenizer.')
+ parser.add_argument('--trust-remote-code', action='store_true', help='trust remote code from huggingface')
+ parser.add_argument('--download-dir',
+ type=str,
+                            default=None,
+ help='directory to download and load the weights, '
+ 'default to the default cache dir of '
+ 'huggingface')
+ parser.add_argument('--load-format',
+ type=str,
+ default=EngineArgs.load_format,
+ choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
+ help='The format of the model weights to load. '
+ '"auto" will try to load the weights in the safetensors format '
+ 'and fall back to the pytorch bin format if safetensors format '
+ 'is not available. '
+ '"pt" will load the weights in the pytorch bin format. '
+ '"safetensors" will load the weights in the safetensors format. '
+ '"npcache" will load the weights in pytorch format and store '
+ 'a numpy cache to speed up the loading. '
+ '"dummy" will initialize the weights with random values, '
+ 'which is mainly for profiling.')
+ parser.add_argument('--dtype',
+ type=str,
+ default=EngineArgs.dtype,
+ choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+ help='data type for model weights and activations. '
+ 'The "auto" option will use FP16 precision '
+ 'for FP32 and FP16 models, and BF16 precision '
+ 'for BF16 models.')
+ parser.add_argument('--max-model-len',
+ type=int,
+ default=None,
+ help='model context length. If unspecified, '
+ 'will be automatically derived from the model.')
+ # Parallel arguments
+ parser.add_argument('--worker-use-ray',
+ action='store_true',
+ help='use Ray for distributed serving, will be '
+ 'automatically set when using more than 1 GPU')
+ parser.add_argument('--pipeline-parallel-size',
+ '-pp',
+ type=int,
+ default=EngineArgs.pipeline_parallel_size,
+ help='number of pipeline stages')
+ parser.add_argument('--tensor-parallel-size',
+ '-tp',
+ type=int,
+ default=EngineArgs.tensor_parallel_size,
+ help='number of tensor parallel replicas')
+ # KV cache arguments
+ parser.add_argument('--block-size',
+ type=int,
+ default=EngineArgs.block_size,
+ choices=[8, 16, 32],
+ help='token block size')
+ # TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
+ parser.add_argument('--seed', type=int, default=EngineArgs.seed, help='random seed')
+ parser.add_argument('--swap-space',
+ type=int,
+ default=EngineArgs.swap_space,
+ help='CPU swap space size (GiB) per GPU')
+ parser.add_argument('--gpu-memory-utilization',
+ type=float,
+ default=EngineArgs.gpu_memory_utilization,
+ help='the percentage of GPU memory to be used for'
+ 'the model executor')
+ parser.add_argument('--max-num-batched-tokens',
+ type=int,
+ default=EngineArgs.max_num_batched_tokens,
+ help='maximum number of batched tokens per '
+ 'iteration')
+ parser.add_argument('--max-num-seqs',
+ type=int,
+ default=EngineArgs.max_num_seqs,
+ help='maximum number of sequences per iteration')
+ parser.add_argument('--disable-log-stats', action='store_true', help='disable logging statistics')
+ # Quantization settings.
+ parser.add_argument('--quantization',
+ '-q',
+ type=str,
+ choices=['awq', None],
+ default=None,
+ help='Method used to quantize the weights')
+ return parser
+
+ @classmethod
+ def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
+ # Get the list of attributes of this dataclass.
+ attrs = [attr.name for attr in dataclasses.fields(cls)]
+ # Set the attributes from the parsed arguments.
+ engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
+ return engine_args
+
+    def create_engine_configs(
+        self,
+    ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig, DeviceConfig, Optional[LoRAConfig]]:
+ device_config = DeviceConfig(self.device)
+ model_config = ModelConfig(self.model_hf_config, self.dtype, self.seed, self.load_format, self.revision,
+ self.tokenizer_revision, self.max_model_len, self.quantization, self.enforce_eager,
+ self.max_context_len_to_capture)
+ cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype,
+ model_config.get_sliding_window())
+ parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray,
+ self.max_parallel_loading_workers, self.disable_custom_all_reduce)
+ scheduler_config = SchedulerConfig(self.max_num_batched_tokens, self.max_num_seqs, model_config.max_model_len,
+ self.max_paddings)
+ lora_config = LoRAConfig(max_lora_rank=self.max_lora_rank,
+ max_loras=self.max_loras,
+ lora_extra_vocab_size=self.lora_extra_vocab_size,
+ lora_dtype=self.lora_dtype,
+ max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras and self.max_cpu_loras > 0 else
+ None) if self.enable_lora else None
+ return (model_config, cache_config, parallel_config, scheduler_config, device_config, lora_config)
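+    # Construction sketch (hedged; hf_config is an illustrative transformers
+    # PretrainedConfig loaded elsewhere):
+    #
+    #     engine_args = EngineArgs(model_hf_config=hf_config, dtype='bfloat16',
+    #                              tensor_parallel_size=2)
+    #     configs = engine_args.create_engine_configs()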
+
+
+@dataclass
+class AsyncEngineArgs(EngineArgs):
+ """Arguments for asynchronous vLLM engine."""
+ engine_use_ray: bool = False
+ disable_log_requests: bool = False
+ max_log_len: Optional[int] = None
+
+ @staticmethod
+ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+ parser = EngineArgs.add_cli_args(parser)
+ parser.add_argument('--engine-use-ray',
+ action='store_true',
+ help='use Ray to start the LLM engine in a '
+ 'separate process as the server process.')
+ parser.add_argument('--disable-log-requests', action='store_true', help='disable logging requests')
+ parser.add_argument('--max-log-len',
+ type=int,
+ default=None,
+ help='max number of prompt characters or prompt '
+ 'ID numbers being printed in log. '
+ 'Default: unlimited.')
+ return parser
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/config.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e1fead86283e1c9594b7556555158a6dc72e6f0
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/config.py
@@ -0,0 +1,577 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py
+
+from typing import Optional, Union, ClassVar
+from dataclasses import dataclass
+import torch
+from transformers import PretrainedConfig
+from packaging.version import Version
+
+from vllm.logger import init_logger
+from vllm.transformers_utils.config import get_config
+from vllm.utils import get_cpu_memory, is_hip, get_nvcc_cuda_version
+
+logger = init_logger(__name__)
+
+_GB = 1 << 30
+
+
+class ModelConfig:
+ """Configuration for the model.
+
+ Args:
+ model: Name or path of the huggingface model to use.
+ tokenizer: Name or path of the huggingface tokenizer to use.
+ tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
+ available, and "slow" will always use the slow tokenizer.
+ trust_remote_code: Trust remote code (e.g., from HuggingFace) when
+ downloading the model and tokenizer.
+ download_dir: Directory to download and load the weights, default to the
+ default cache directory of huggingface.
+ load_format: The format of the model weights to load:
+ "auto" will try to load the weights in the safetensors format and
+ fall back to the pytorch bin format if safetensors format is
+ not available.
+ "pt" will load the weights in the pytorch bin format.
+ "safetensors" will load the weights in the safetensors format.
+ "npcache" will load the weights in pytorch format and store
+ a numpy cache to speed up the loading.
+ "dummy" will initialize the weights with random values, which is
+ mainly for profiling.
+ dtype: Data type for model weights and activations. The "auto" option
+ will use FP16 precision for FP32 and FP16 models, and BF16 precision
+ for BF16 models.
+ seed: Random seed for reproducibility.
+ revision: The specific model version to use. It can be a branch name,
+ a tag name, or a commit id. If unspecified, will use the default
+ version.
+ tokenizer_revision: The specific tokenizer version to use. It can be a
+ branch name, a tag name, or a commit id. If unspecified, will use
+ the default version.
+ max_model_len: Maximum length of a sequence (including prompt and
+ output). If None, will be derived from the model.
+ quantization: Quantization method that was used to quantize the model
+ weights. If None, we assume the model weights are not quantized.
+ enforce_eager: Whether to enforce eager execution. If True, we will
+ disable CUDA graph and always execute the model in eager mode.
+ If False, we will use CUDA graph and eager execution in hybrid.
+ max_context_len_to_capture: Maximum context len covered by CUDA graphs.
+ When a sequence has context length larger than this, we fall back
+ to eager mode.
+ """
+
+ def __init__(
+ self,
+ hf_config: PretrainedConfig,
+ dtype: str,
+ seed: int,
+ load_format: str = 'model',
+ revision: Optional[str] = None,
+ tokenizer_revision: Optional[str] = None,
+ max_model_len: Optional[int] = None,
+ quantization: Optional[str] = None,
+ trust_remote_code: Optional[bool] = True,
+ enforce_eager: bool = False,
+ max_context_len_to_capture: Optional[int] = None,
+ ) -> None:
+ self.model = hf_config._name_or_path
+ self.tokenizer = hf_config._name_or_path
+ self.load_format = load_format
+ self.seed = seed
+ self.revision = revision
+ self.tokenizer_revision = tokenizer_revision
+ self.quantization = quantization
+ self.trust_remote_code = trust_remote_code
+ self.enforce_eager = enforce_eager
+ self.max_context_len_to_capture = max_context_len_to_capture
+
+ # self.hf_config = get_config(model, trust_remote_code, revision)
+ self.hf_config = hf_config
+ self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
+ self.max_model_len = _get_and_verify_max_len(self.hf_config, max_model_len)
+ # self._verify_load_format()
+ # self._verify_tokenizer_mode()
+ self._verify_quantization()
+ self._verify_cuda_graph()
+
+ def _verify_load_format(self) -> None:
+ load_format = self.load_format.lower()
+ if load_format not in ["auto", "pt", "safetensors", "npcache", "dummy", "model"]:
+ raise ValueError(f"Unknown load format: {self.load_format}. Must be one of "
+ "'auto', 'pt', 'safetensors', 'npcache', 'dummy' or 'model'.")
+ self.load_format = load_format
+
+ # def _verify_tokenizer_mode(self) -> None:
+ # tokenizer_mode = self.tokenizer_mode.lower()
+ # if tokenizer_mode not in ["auto", "slow"]:
+ # raise ValueError(
+ # f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
+ # "either 'auto' or 'slow'.")
+ # self.tokenizer_mode = tokenizer_mode
+
+ def _verify_quantization(self) -> None:
+ supported_quantization = ["awq", "gptq", "squeezellm"]
+ rocm_not_supported_quantization = ["awq", "gptq"]
+ if self.quantization is not None:
+ self.quantization = self.quantization.lower()
+
+ # Parse quantization method from the HF model config, if available.
+ hf_quant_config = getattr(self.hf_config, "quantization_config", None)
+ if hf_quant_config is not None:
+ hf_quant_method = str(hf_quant_config["quant_method"]).lower()
+ if self.quantization is None:
+ self.quantization = hf_quant_method
+ elif self.quantization != hf_quant_method:
+ raise ValueError("Quantization method specified in the model config "
+ f"({hf_quant_method}) does not match the quantization "
+ f"method specified in the `quantization` argument "
+ f"({self.quantization}).")
+
+ if self.quantization is not None:
+ if self.quantization not in supported_quantization:
+ raise ValueError(f"Unknown quantization method: {self.quantization}. Must "
+ f"be one of {supported_quantization}.")
+ if is_hip() and self.quantization in rocm_not_supported_quantization:
+ raise ValueError(f"{self.quantization} quantization is currently not supported "
+ f"in ROCm.")
+ logger.warning(f"{self.quantization} quantization is not fully "
+ "optimized yet. The speed can be slower than "
+ "non-quantized models.")
+
+ def _verify_cuda_graph(self) -> None:
+ if self.max_context_len_to_capture is None:
+ self.max_context_len_to_capture = self.max_model_len
+ self.max_context_len_to_capture = min(self.max_context_len_to_capture, self.max_model_len)
+ if (self.quantization in ["gptq", "squeezellm"] and not self.enforce_eager):
+ # Related issue: https://github.com/vllm-project/vllm/issues/2147
+ logger.warning(f"{self.quantization} does not support CUDA graph "
+ "yet. Disabling CUDA graph.")
+ self.enforce_eager = True
+
+ def verify_with_parallel_config(
+ self,
+ parallel_config: "ParallelConfig",
+ ) -> None:
+ total_num_attention_heads = self.hf_config.num_attention_heads
+ tensor_parallel_size = parallel_config.tensor_parallel_size
+ if total_num_attention_heads % tensor_parallel_size != 0:
+ raise ValueError(f"Total number of attention heads ({total_num_attention_heads})"
+ " must be divisible by tensor parallel size "
+ f"({tensor_parallel_size}).")
+
+ total_num_hidden_layers = self.hf_config.num_hidden_layers
+ pipeline_parallel_size = parallel_config.pipeline_parallel_size
+ if total_num_hidden_layers % pipeline_parallel_size != 0:
+ raise ValueError(f"Total number of hidden layers ({total_num_hidden_layers}) "
+ "must be divisible by pipeline parallel size "
+ f"({pipeline_parallel_size}).")
+
+ def get_sliding_window(self) -> Optional[int]:
+ return getattr(self.hf_config, "sliding_window", None)
+
+ def get_vocab_size(self) -> int:
+ return self.hf_config.vocab_size
+
+ def get_hidden_size(self) -> int:
+ return self.hf_config.hidden_size
+
+ def get_head_size(self) -> int:
+ # FIXME(woosuk): This may not be true for all models.
+ return self.hf_config.hidden_size // self.hf_config.num_attention_heads
+
+ def get_total_num_kv_heads(self) -> int:
+ """Returns the total number of KV heads."""
+ # For GPTBigCode & Falcon:
+ # NOTE: for falcon, when new_decoder_architecture is True, the
+ # multi_query flag is ignored and we use n_head_kv for the number of
+ # KV heads.
+ falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
+ new_decoder_arch_falcon = (self.hf_config.model_type in falcon_model_types and
+ getattr(self.hf_config, "new_decoder_architecture", False))
+ if not new_decoder_arch_falcon and getattr(self.hf_config, "multi_query", False):
+ # Multi-query attention, only one KV head.
+ # Currently, tensor parallelism is not supported in this case.
+ return 1
+
+ attributes = [
+ # For Falcon:
+ "n_head_kv",
+ "num_kv_heads",
+ # For LLaMA-2:
+ "num_key_value_heads",
+ # For ChatGLM:
+ "multi_query_group_num",
+ ]
+ for attr in attributes:
+ num_kv_heads = getattr(self.hf_config, attr, None)
+ if num_kv_heads is not None:
+ return num_kv_heads
+
+ # For non-grouped-query attention models, the number of KV heads is
+ # equal to the number of attention heads.
+ return self.hf_config.num_attention_heads
+
+ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
+ """Returns the number of KV heads per GPU."""
+ total_num_kv_heads = self.get_total_num_kv_heads()
+ # If tensor parallelism is used, we divide the number of KV heads by
+ # the tensor parallel size. We will replicate the KV heads in the
+ # case where the number of KV heads is smaller than the tensor
+ # parallel size so each GPU has at least one KV head.
+ return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size)
+
+ def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
+ total_num_hidden_layers = self.hf_config.num_hidden_layers
+ return total_num_hidden_layers // parallel_config.pipeline_parallel_size
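+    # Example (hedged): a LLaMA-2 style config with num_key_value_heads=8 under
+    # tensor_parallel_size=4 yields get_num_kv_heads(...) == max(1, 8 // 4) == 2.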
+
+
+class CacheConfig:
+ """Configuration for the KV cache.
+
+ Args:
+ block_size: Size of a cache block in number of tokens.
+ gpu_memory_utilization: Fraction of GPU memory to use for the
+ vLLM execution.
+ swap_space: Size of the CPU swap space per GPU (in GiB).
+ cache_dtype: Data type for kv cache storage.
+ """
+
+ def __init__(
+ self,
+ block_size: int,
+ gpu_memory_utilization: float,
+ swap_space: int,
+ cache_dtype: str,
+ sliding_window: Optional[int] = None,
+ ) -> None:
+ self.block_size = block_size
+ self.gpu_memory_utilization = gpu_memory_utilization
+ self.swap_space_bytes = swap_space * _GB
+ self.cache_dtype = cache_dtype
+ self.sliding_window = sliding_window
+ self._verify_args()
+ self._verify_cache_dtype()
+
+ # Will be set after profiling.
+ self.num_gpu_blocks = None
+ self.num_cpu_blocks = None
+
+ def _verify_args(self) -> None:
+ if self.gpu_memory_utilization > 1.0:
+            raise ValueError("GPU memory utilization cannot exceed 1.0. Got "
+                             f"{self.gpu_memory_utilization}.")
+
+ def _verify_cache_dtype(self) -> None:
+ if self.cache_dtype == "auto":
+ pass
+ elif self.cache_dtype == "fp8_e5m2":
+ nvcc_cuda_version = get_nvcc_cuda_version()
+ if nvcc_cuda_version < Version("11.8"):
+ raise ValueError("FP8 is not supported when cuda version is lower than 11.8.")
+ device_name = torch.cuda.get_device_name()
+ if "AMD" in device_name:
+ raise NotImplementedError("FP8_E5M2 KV Cache on AMD GPU has not been supported yet.")
+ logger.info("Using fp8_e5m2 data type to store kv cache. It reduces "
+ "the GPU memory footprint and boosts the performance. "
+ "But it may cause slight accuracy drop. "
+ "Currently we only support fp8 without scaling factors and "
+ "make e5m2 as a default format.")
+ else:
+ raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}")
+
+ def verify_with_parallel_config(
+ self,
+ parallel_config: "ParallelConfig",
+ ) -> None:
+ total_cpu_memory = get_cpu_memory()
+ # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
+ # group are in the same node. However, the GPUs may span multiple nodes.
+ num_gpus_per_node = parallel_config.tensor_parallel_size
+ cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node
+
+ msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of "
+ f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
+ "allocated for the swap space.")
+ if cpu_memory_usage > 0.7 * total_cpu_memory:
+ raise ValueError("Too large swap space. " + msg)
+ elif cpu_memory_usage > 0.4 * total_cpu_memory:
+ logger.warning("Possibly too large swap space. " + msg)
+
+
+class ParallelConfig:
+ """Configuration for the distributed execution.
+
+ Args:
+ pipeline_parallel_size: Number of pipeline parallel groups.
+ tensor_parallel_size: Number of tensor parallel groups.
+ worker_use_ray: Whether to use Ray for model workers. Will be set to
+ True if either pipeline_parallel_size or tensor_parallel_size is
+ greater than 1.
+        max_parallel_loading_workers: Maximum number of workers used when
+            loading the model in sequential batches, to avoid RAM OOM with
+            tensor parallelism and large models.
+ disable_custom_all_reduce: Disable the custom all-reduce kernel and
+ fall back to NCCL.
+ """
+
+ def __init__(
+ self,
+ pipeline_parallel_size: int,
+ tensor_parallel_size: int,
+ worker_use_ray: bool,
+ max_parallel_loading_workers: Optional[int] = None,
+ disable_custom_all_reduce: bool = False,
+ ) -> None:
+ self.pipeline_parallel_size = pipeline_parallel_size
+ self.tensor_parallel_size = tensor_parallel_size
+ self.worker_use_ray = worker_use_ray
+ self.max_parallel_loading_workers = max_parallel_loading_workers
+ self.disable_custom_all_reduce = disable_custom_all_reduce
+
+ self.world_size = pipeline_parallel_size * tensor_parallel_size
+ if self.world_size > 1:
+ self.worker_use_ray = True
+ self._verify_args()
+
+ def _verify_args(self) -> None:
+ if self.pipeline_parallel_size > 1:
+ raise NotImplementedError("Pipeline parallelism is not supported yet.")
+ if not self.disable_custom_all_reduce and self.world_size > 1:
+ if is_hip():
+ self.disable_custom_all_reduce = True
+ logger.info("Disabled the custom all-reduce kernel because it is not "
+ "supported on AMD GPUs.")
+ elif self.pipeline_parallel_size > 1:
+ self.disable_custom_all_reduce = True
+ logger.info("Disabled the custom all-reduce kernel because it is not "
+ "supported with pipeline parallelism.")
+
+ # FIXME(woosuk): Fix the stability issues and re-enable the custom
+ # all-reduce kernel.
+ if not self.disable_custom_all_reduce and self.world_size > 1:
+ self.disable_custom_all_reduce = True
+ logger.info("Custom all-reduce kernels are temporarily disabled due to "
+ "stability issues. We will re-enable them once the issues are "
+ "resolved.")
+
+
+class SchedulerConfig:
+ """Scheduler configuration.
+
+ Args:
+ max_num_batched_tokens: Maximum number of tokens to be processed in
+ a single iteration.
+ max_num_seqs: Maximum number of sequences to be processed in a single
+ iteration.
+ max_model_len: Maximum length of a sequence (including prompt
+ and generated text).
+ max_paddings: Maximum number of paddings to be added to a batch.
+ """
+
+ def __init__(
+ self,
+ max_num_batched_tokens: Optional[int],
+ max_num_seqs: int,
+ max_model_len: int,
+ max_paddings: int,
+ ) -> None:
+ if max_num_batched_tokens is not None:
+ self.max_num_batched_tokens = max_num_batched_tokens
+ else:
+ # If max_model_len is too short, use 2048 as the default value for
+ # higher throughput.
+ self.max_num_batched_tokens = max(max_model_len, 2048)
+ self.max_num_seqs = max_num_seqs
+ self.max_model_len = max_model_len
+ self.max_paddings = max_paddings
+ self._verify_args()
+
+ def _verify_args(self) -> None:
+ if self.max_num_batched_tokens < self.max_model_len:
+ raise ValueError(f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
+ f"smaller than max_model_len ({self.max_model_len}). "
+ "This effectively limits the maximum sequence length to "
+ "max_num_batched_tokens and makes vLLM reject longer "
+ "sequences. Please increase max_num_batched_tokens or "
+ "decrease max_model_len.")
+ if self.max_num_batched_tokens < self.max_num_seqs:
+ raise ValueError(f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
+ "be greater than or equal to max_num_seqs "
+ f"({self.max_num_seqs}).")
+
+
+class DeviceConfig:
+
+ def __init__(self, device: str = "cuda") -> None:
+ self.device = torch.device(device)
+
+
+@dataclass
+class LoRAConfig:
+ max_lora_rank: int
+ max_loras: int
+ max_cpu_loras: Optional[int] = None
+ lora_dtype: Optional[torch.dtype] = None
+ lora_extra_vocab_size: int = 256
+ # This is a constant.
+ lora_vocab_padding_size: ClassVar[int] = 256
+
+ def __post_init__(self):
+ # Keep this in sync with csrc/punica/bgmv/bgmv_config.h
+ possible_max_ranks = (8, 16, 32, 64)
+ possible_lora_extra_vocab_size = (0, 256, 512)
+ if self.max_lora_rank not in possible_max_ranks:
+ raise ValueError(f"max_lora_rank ({self.max_lora_rank}) must be one of "
+ f"{possible_max_ranks}.")
+ if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
+ raise ValueError(f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
+ f"must be one of {possible_lora_extra_vocab_size}.")
+ if self.max_loras < 1:
+ raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")
+ if self.max_cpu_loras is None:
+ self.max_cpu_loras = self.max_loras
+ elif self.max_cpu_loras < self.max_loras:
+ raise ValueError(f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
+ f"max_loras ({self.max_loras})")
+
+ def verify_with_model_config(self, model_config: ModelConfig):
+ if self.lora_dtype in (None, "auto"):
+ self.lora_dtype = model_config.dtype
+ elif isinstance(self.lora_dtype, str):
+ self.lora_dtype = getattr(torch, self.lora_dtype)
+ if model_config.quantization is not None:
+ raise ValueError("LoRA is not supported with quantized models yet.")
+
+ def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
+ if scheduler_config.max_num_batched_tokens > 65528:
+ raise ValueError("Due to limitations of the custom LoRA CUDA kernel, "
+ "max_num_batched_tokens must be <= 65528 when "
+ "LoRA is enabled.")
+
+
+_STR_DTYPE_TO_TORCH_DTYPE = {
+ "half": torch.float16,
+ "float16": torch.float16,
+ "float": torch.float32,
+ "float32": torch.float32,
+ "bfloat16": torch.bfloat16,
+}
+
+_ROCM_NOT_SUPPORTED_DTYPE = ["float", "float32"]
+
+
+def _get_and_verify_dtype(
+ config: PretrainedConfig,
+ dtype: Union[str, torch.dtype],
+) -> torch.dtype:
+ # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
+ # because config.torch_dtype can be None.
+ config_dtype = getattr(config, "torch_dtype", None)
+ if config_dtype is None:
+ config_dtype = torch.float32
+
+ if isinstance(dtype, str):
+ dtype = dtype.lower()
+ if dtype == "auto":
+ if config_dtype == torch.float32:
+ # Following the common practice, we use float16 for float32
+ # models.
+ torch_dtype = torch.float16
+ else:
+ torch_dtype = config_dtype
+ else:
+ if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
+ raise ValueError(f"Unknown dtype: {dtype}")
+ torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
+ elif isinstance(dtype, torch.dtype):
+ torch_dtype = dtype
+ else:
+ raise ValueError(f"Unknown dtype: {dtype}")
+
+ if is_hip() and torch_dtype == torch.float32:
+ rocm_supported_dtypes = [
+ k for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() if (k not in _ROCM_NOT_SUPPORTED_DTYPE)
+ ]
+ raise ValueError(f"dtype \'{dtype}\' is not supported in ROCm. "
+ f"Supported dtypes are {rocm_supported_dtypes}")
+
+ # Verify the dtype.
+ if torch_dtype != config_dtype:
+ if torch_dtype == torch.float32:
+ # Upcasting to float32 is allowed.
+ pass
+ elif config_dtype == torch.float32:
+ # Downcasting from float32 to float16 or bfloat16 is allowed.
+ pass
+ else:
+ # Casting between float16 and bfloat16 is allowed with a warning.
+ logger.warning(f"Casting {config_dtype} to {torch_dtype}.")
+
+ return torch_dtype
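+# Behaviour sketch (hedged; cfg_fp32/cfg_bf16 are illustrative configs):
+#
+#     _get_and_verify_dtype(cfg_fp32, 'auto')     # -> torch.float16 (fp32 models run in fp16)
+#     _get_and_verify_dtype(cfg_bf16, 'float16')  # -> torch.float16, logging a cast warning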
+
+
+def _get_and_verify_max_len(
+ hf_config: PretrainedConfig,
+ max_model_len: Optional[int],
+) -> int:
+ """Get and verify the model's maximum length."""
+ derived_max_model_len = float("inf")
+ possible_keys = [
+ # OPT
+ "max_position_embeddings",
+ # GPT-2
+ "n_positions",
+ # MPT
+ "max_seq_len",
+ # ChatGLM2
+ "seq_length",
+ # Others
+ "max_sequence_length",
+ "max_seq_length",
+ "seq_len",
+ ]
+ for key in possible_keys:
+ max_len_key = getattr(hf_config, key, None)
+ if max_len_key is not None:
+ derived_max_model_len = min(derived_max_model_len, max_len_key)
+ if derived_max_model_len == float("inf"):
+ if max_model_len is not None:
+ # If max_model_len is specified, we use it.
+ return max_model_len
+
+ default_max_len = 2048
+ logger.warning("The model's config.json does not contain any of the following "
+ "keys to determine the original maximum length of the model: "
+ f"{possible_keys}. Assuming the model's maximum length is "
+ f"{default_max_len}.")
+ derived_max_model_len = default_max_len
+
+ rope_scaling = getattr(hf_config, "rope_scaling", None)
+ if rope_scaling is not None:
+ assert "factor" in rope_scaling
+ scaling_factor = rope_scaling["factor"]
+ if rope_scaling["type"] == "yarn":
+ derived_max_model_len = rope_scaling["original_max_position_embeddings"]
+ derived_max_model_len *= scaling_factor
+
+ if max_model_len is None:
+ max_model_len = derived_max_model_len
+ elif max_model_len > derived_max_model_len:
+        raise ValueError(f"User-specified max_model_len ({max_model_len}) is greater than "
+                         f"the derived max_model_len ({derived_max_model_len} "
+                         "in model's config.json). This may lead to incorrect model "
+                         "outputs or CUDA errors. Make sure the value is correct and "
+                         "within the model context size.")
+ return int(max_model_len)
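+# Derivation sketch (hedged): with max_position_embeddings=2048 and
+# rope_scaling={'type': 'linear', 'factor': 4.0}, the derived max_model_len is
+# 2048 * 4.0 = 8192; for 'yarn', original_max_position_embeddings is scaled instead.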
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/llm.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d2475998ca2658b14d4a572e10cbfc96cfc3d35
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/llm.py
@@ -0,0 +1,275 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py
+
+from typing import Dict, List, Optional, Tuple, Union
+
+from tqdm import tqdm
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+from transformers import PretrainedConfig
+import torch.nn as nn
+from .arg_utils import EngineArgs
+from .llm_engine_sp import LLMEngine
+from vllm.lora.request import LoRARequest
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+from vllm.utils import Counter
+import torch
+from torch.nn.utils.rnn import pad_sequence
+from verl.workers.rollout.tokenizer import HybridEngineBaseTokenizer
+
+
+class LLM:
+ """An LLM for generating texts from given prompts and sampling parameters.
+
+ This class includes a tokenizer, a language model (possibly distributed
+ across multiple GPUs), and GPU memory space allocated for intermediate
+ states (aka KV cache). Given a batch of prompts and sampling parameters,
+ this class generates texts from the model, using an intelligent batching
+ mechanism and efficient memory management.
+
+ NOTE: This class is intended to be used for offline inference. For online
+ serving, use the `AsyncLLMEngine` class instead.
+ NOTE: For the comprehensive list of arguments, see `EngineArgs`.
+
+ Args:
+        model: A HuggingFace Transformers model instance, or a dict of its
+            parameters (e.g. a state_dict).
+        tokenizer: A HuggingFace Transformers tokenizer instance, or a verl
+            `HybridEngineBaseTokenizer`.
+ tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
+ if available, and "slow" will always use the slow tokenizer.
+ trust_remote_code: Trust remote code (e.g., from HuggingFace) when
+ downloading the model and tokenizer.
+ tensor_parallel_size: The number of GPUs to use for distributed
+ execution with tensor parallelism.
+ dtype: The data type for the model weights and activations. Currently,
+ we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
+ the `torch_dtype` attribute specified in the model config file.
+ However, if the `torch_dtype` in the config is `float32`, we will
+ use `float16` instead.
+ quantization: The method used to quantize the model weights. Currently,
+ we support "awq". If None, we assume the model weights are not
+ quantized and use `dtype` to determine the data type of the weights.
+ revision: The specific model version to use. It can be a branch name,
+ a tag name, or a commit id.
+ tokenizer_revision: The specific tokenizer version to use. It can be a
+ branch name, a tag name, or a commit id.
+ seed: The seed to initialize the random number generator for sampling.
+ gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
+ reserve for the model weights, activations, and KV cache. Higher
+ values will increase the KV cache size and thus improve the model's
+ throughput. However, if the value is too high, it may cause out-of-
+ memory (OOM) errors.
+ swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
+ This can be used for temporarily storing the states of the requests
+ when their `best_of` sampling parameters are larger than 1. If all
+ requests will have `best_of=1`, you can safely set this to 0.
+ Otherwise, too small values may cause out-of-memory (OOM) errors.
+ enforce_eager: Whether to enforce eager execution. If True, we will
+ disable CUDA graph and always execute the model in eager mode.
+ If False, we will use CUDA graph and eager execution in hybrid.
+ max_context_len_to_capture: Maximum context len covered by CUDA graphs.
+ When a sequence has context length larger than this, we fall back
+ to eager mode.
+ disable_custom_all_reduce: See ParallelConfig
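+
+    Example (a minimal sketch; the model name is illustrative and assumes the
+    weights fit on one GPU):
+        >>> from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+        >>> path = "facebook/opt-125m"
+        >>> llm = LLM(model=AutoModelForCausalLM.from_pretrained(path),
+        >>>           tokenizer=AutoTokenizer.from_pretrained(path),
+        >>>           model_hf_config=AutoConfig.from_pretrained(path))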
+ """
+
+ def __init__(
+ self,
+ model: Union[nn.Module, Dict], # model itself or its parameter dict
+ tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer],
+ model_hf_config: PretrainedConfig,
+ tokenizer_mode: str = "auto",
+ trust_remote_code: bool = False,
+ tensor_parallel_size: int = 1,
+ dtype: str = "auto",
+ quantization: Optional[str] = None,
+ revision: Optional[str] = None,
+ tokenizer_revision: Optional[str] = None,
+ seed: int = 0,
+ gpu_memory_utilization: float = 0.9,
+ swap_space: int = 4,
+ enforce_eager: bool = False,
+ max_context_len_to_capture: int = 8192,
+ disable_custom_all_reduce: bool = False,
+ **kwargs,
+ ) -> None:
+ if "disable_log_stats" not in kwargs:
+ kwargs["disable_log_stats"] = True
+ engine_args = EngineArgs(
+ model_hf_config=model_hf_config,
+ tensor_parallel_size=tensor_parallel_size,
+ dtype=dtype,
+ quantization=quantization,
+ revision=revision,
+ tokenizer_revision=tokenizer_revision,
+ seed=seed,
+ gpu_memory_utilization=gpu_memory_utilization,
+ swap_space=swap_space,
+ enforce_eager=enforce_eager,
+ max_context_len_to_capture=max_context_len_to_capture,
+ disable_custom_all_reduce=disable_custom_all_reduce,
+ **kwargs,
+ )
+ tokenizer_cls = (PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer)
+ if not isinstance(tokenizer, tokenizer_cls):
+            raise ValueError(
+                f"Unexpected tokenizer type: {type(tokenizer)}. Must be "
+                "one of the following: PreTrainedTokenizer, PreTrainedTokenizerFast, "
+                "verl.workers.rollout.HybridEngineBaseTokenizer"
+            )
+ self.llm_engine = LLMEngine.from_engine_args(model, tokenizer, engine_args)
+ self.request_counter = Counter()
+
+ def init_cache_engine(self):
+ self.llm_engine.init_cache_engine()
+
+ def free_cache_engine(self):
+ self.llm_engine.free_cache_engine()
+
+ def get_tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+ return self.llm_engine.tokenizer
+
+ def set_tokenizer(
+ self,
+ tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+ ) -> None:
+ self.llm_engine.tokenizer = tokenizer
+
+ def generate(
+ self,
+ prompts: Optional[Union[str, List[str]]] = None,
+ sampling_params: Optional[SamplingParams] = None,
+ prompt_token_ids: Optional[List[List[int]]] = None,
+ prefix_pos: Optional[Union[int, List[int]]] = None,
+ use_tqdm: bool = True,
+ lora_request: Optional[LoRARequest] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Generates the completions for the input prompts.
+
+        NOTE: This method automatically batches the given prompts, considering
+ the memory constraint. For the best performance, put all of your prompts
+ into a single list and pass it to this method.
+
+ Args:
+ prompts: A list of prompts to generate completions for.
+ sampling_params: The sampling parameters for text generation. If
+ None, we use the default sampling parameters.
+ prompt_token_ids: A list of token IDs for the prompts. If None, we
+ use the tokenizer to convert the prompts to token IDs.
+ use_tqdm: Whether to use tqdm to display the progress bar.
+
+        Returns:
+            A tuple `(output_token_ids, logprobs)` of right-padded tensors
+            produced by `_post_process_outputs`, ordered like the input
+            prompts. (Unlike upstream vLLM, this adaptation does not return
+            `RequestOutput` objects here.)
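+
+        Example (a sketch; assumes `llm` was built as in the class docstring
+        and `batch` is a left-padded 2-D LongTensor of prompt token ids):
+            >>> params = SamplingParams(temperature=1.0, max_tokens=32)
+            >>> token_ids, logprobs = llm.generate(
+            >>>     prompt_token_ids=batch, sampling_params=params)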
+ """
+ if prompts is None and prompt_token_ids is None:
+ raise ValueError("Either prompts or prompt_token_ids must be "
+ "provided.")
+ if isinstance(prompts, str):
+ # Convert a single prompt to a list.
+ prompts = [prompts]
+ if prompts is not None and prompt_token_ids is not None:
+ if len(prompts) != len(prompt_token_ids):
+ raise ValueError("The lengths of prompts and prompt_token_ids "
+ "must be the same.")
+ if sampling_params is None:
+ # Use default sampling params.
+ sampling_params = SamplingParams()
+
+ # Add requests to the engine.
+ num_requests = len(prompts) if prompts is not None else len(prompt_token_ids)
+ for i in range(num_requests):
+ prompt = prompts[i] if prompts is not None else None
+ prefix_pos_i = prefix_pos[i] if prefix_pos is not None else None
+ token_ids = None if prompt_token_ids is None else prompt_token_ids[i]
+            if token_ids is not None and not isinstance(token_ids, list):
+                # NOTE(shengguangming): convert the padded rollout tensor into List[int]
+                token_ids = self._pre_process_inputs(token_ids)
+ self._add_request(prompt, sampling_params, token_ids, lora_request=lora_request, prefix_pos=prefix_pos_i)
+ return self._run_engine(use_tqdm)
+
+ def _add_request(
+ self,
+ prompt: Optional[str],
+ sampling_params: SamplingParams,
+ prompt_token_ids: Optional[List[int]],
+ lora_request: Optional[LoRARequest] = None,
+ prefix_pos: Optional[int] = None,
+ ) -> None:
+ request_id = str(next(self.request_counter))
+ self.llm_engine.add_request(request_id,
+ prompt,
+ sampling_params,
+ prompt_token_ids,
+ lora_request=lora_request,
+ prefix_pos=prefix_pos)
+
+    def _run_engine(self, use_tqdm: bool) -> Tuple[torch.Tensor, torch.Tensor]:
+ # Initialize tqdm.
+ if use_tqdm:
+ num_requests = self.llm_engine.get_num_unfinished_requests()
+ pbar = tqdm(total=num_requests, desc="Processed prompts")
+ # Run the engine.
+ outputs: List[RequestOutput] = []
+ while self.llm_engine.has_unfinished_requests():
+ step_outputs = self.llm_engine.step()
+ for output in step_outputs:
+ if output.finished:
+ outputs.append(output)
+ if use_tqdm:
+ pbar.update(1)
+ if use_tqdm:
+ pbar.close()
+ # Sort the outputs by request ID.
+        # This is necessary because some requests may finish earlier than
+        # requests that were submitted before them.
+        outputs = sorted(outputs, key=lambda x: int(x.request_id))
+        # TODO(shengguangming): maybe we can hack the autoregressive logic so that
+        # this post-processing step becomes unnecessary, for better performance.
+ return self._post_process_outputs(outputs)
+
+ # NOTE(shengguangming): add for verl
+ # TODO(sgm): we can optimize it by making the dataloader yield List[int] without padding.
+    def _pre_process_inputs(self, prompt_token_ids: torch.Tensor) -> List[int]:
+        # Remove the left padding from the prompt token ids,
+        # e.g. [pad, pad, 5, 6, 7] -> [5, 6, 7].
+        tokenizer = self.llm_engine.tokenizer
+        pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
+        non_pad_index = torch.nonzero(prompt_token_ids != pad_token_id, as_tuple=False)[0][0]
+        return prompt_token_ids[non_pad_index:].tolist()
+
+ # NOTE(shengguangming): add for verl
+    def _post_process_outputs(self, outputs: List[RequestOutput]) -> Tuple[torch.Tensor, torch.Tensor]:
+        output_token_ids = []
+        logprobs = []
+        for request_output in outputs:  # List[RequestOutput]
+            for completion in request_output.outputs:  # List[CompletionOutput], usually len == 1
+                output_token_ids.append(torch.tensor(completion.token_ids))
+                # TODO(shengguangming): can be optimized by rewriting the Sampler._get_logprobs() logic
+                logprobs_dicts = completion.logprobs
+                if logprobs_dicts is not None:
+                    # Pick out, at each step, the logprob of the token that was sampled.
+                    logprob = []
+                    for logprobs_dict, token_id in zip(logprobs_dicts, completion.token_ids):
+                        logprob.append(logprobs_dict[token_id])
+                    logprobs.append(torch.tensor(logprob))
+
+        tokenizer = self.llm_engine.tokenizer
+        pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
+        # Right-pad into (batch, max_len) tensors.
+        output_token_ids = pad_sequence(output_token_ids, batch_first=True, padding_value=pad_token_id)
+        if len(logprobs) > 0:
+            logprobs = pad_sequence(logprobs, batch_first=True, padding_value=pad_token_id)
+        return output_token_ids, logprobs
+
+ def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor]) -> None:
+ self.llm_engine.sync_model_weights(actor_weights=actor_weights)
+
+ def offload_model_weights(self) -> None:
+ self.llm_engine.offload_model_weights()
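+
+
+# Typical rollout flow (a sketch; `actor` denotes the training-side module and
+# the call order follows how verl's hybrid engine drives this class):
+#   llm.sync_model_weights(dict(actor.named_parameters()))
+#   llm.init_cache_engine()
+#   token_ids, logprobs = llm.generate(prompt_token_ids=batch, sampling_params=params)
+#   llm.free_cache_engine()
+#   llm.offload_model_weights()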
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/llm_engine_sp.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/llm_engine_sp.py
new file mode 100644
index 0000000000000000000000000000000000000000..e264a8585bc2bb8c5b64efe339af7c9d02475614
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/llm_engine_sp.py
@@ -0,0 +1,765 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/llm_engine.py
+
+import os
+import socket
+import time
+import torch
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union
+
+from vllm.lora.request import LoRARequest
+from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig)
+from vllm.core.scheduler import Scheduler, SchedulerOutputs
+from vllm.logger import init_logger
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup, SequenceGroupMetadata, SequenceGroupOutput,
+ SequenceOutput, SequenceStatus)
+from vllm.transformers_utils.tokenizer import detokenize_incrementally
+from vllm.engine.metrics import StatLogger, Stats
+from vllm.utils import Counter
+import torch.nn as nn
+from .arg_utils import EngineArgs
+from .tokenizer import TokenizerGroup
+
+logger = init_logger(__name__)
+_LOCAL_LOGGING_INTERVAL_SEC = 5
+
+
+class LLMEngine:
+ """An LLM engine that receives requests and generates texts.
+
+ This is the main class for the vLLM engine. It receives requests
+ from clients and generates texts from the LLM. It includes a tokenizer, a
+ language model (possibly distributed across multiple GPUs), and GPU memory
+ space allocated for intermediate states (aka KV cache). This class utilizes
+ iteration-level scheduling and efficient memory management to maximize the
+ serving throughput.
+
+ The `LLM` class wraps this class for offline batched inference and the
+ `AsyncLLMEngine` class wraps this class for online serving.
+
+ NOTE: The config arguments are derived from the `EngineArgs` class. For the
+ comprehensive list of arguments, see `EngineArgs`.
+
+ Args:
+ model_config: The configuration related to the LLM model.
+ cache_config: The configuration related to the KV cache memory
+ management.
+ parallel_config: The configuration related to distributed execution.
+ scheduler_config: The configuration related to the request scheduler.
+ distributed_init_method: The initialization method for distributed
+ execution. See `torch.distributed.init_process_group` for details.
+        placement_group: Ray placement group for distributed execution.
+            Always None in this SPMD adaptation (see `initialize_cluster`).
+ log_stats: Whether to log statistics.
+ """
+
+ def __init__(
+ self,
+ model: Union[nn.Module, Dict], # model itself or its parameter dict
+ tokenizer: nn.Module,
+ model_config: ModelConfig,
+ cache_config: CacheConfig,
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ device_config: DeviceConfig,
+ lora_config: Optional[LoRAConfig],
+ distributed_init_method: str,
+ placement_group: Optional[None],
+ log_stats: bool,
+ ) -> None:
+ logger.info("Initializing an LLM engine with config: "
+ f"model={model_config.model!r}, "
+ f"tokenizer={model_config.tokenizer!r}, "
+ # f"tokenizer_mode={model_config.tokenizer_mode}, "
+ f"revision={model_config.revision}, "
+ f"tokenizer_revision={model_config.tokenizer_revision}, "
+ # f"trust_remote_code={model_config.trust_remote_code}, "
+ f"dtype={model_config.dtype}, "
+ f"max_seq_len={model_config.max_model_len}, "
+ # f"download_dir={model_config.download_dir!r}, "
+ # f"load_format={model_config.load_format}, "
+ f"disable_custom_all_reduce={parallel_config.disable_custom_all_reduce}, "
+ f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
+ f"quantization={model_config.quantization}, "
+ f"seed={model_config.seed})")
+ # TODO(woosuk): Print more configs in debug mode.
+
+        self.model_config = model_config  # TODO: currently this holds the hf config
+ self.cache_config = cache_config
+ self.lora_config = lora_config
+ assert self.cache_config.sliding_window == getattr(self.model_config.hf_config, "sliding_window", None)
+ self.parallel_config = parallel_config
+ self.scheduler_config = scheduler_config
+ self.device_config = device_config
+ self.log_stats = log_stats
+ self._verify_args()
+
+ # self.model = model # should not store the model, it should be deleted
+ # TODO(shengguangming): maybe we can choose init here or from arguments
+ self._init_tokenizer(tokenizer)
+
+ self.seq_counter = Counter()
+
+ # Create the parallel GPU workers.
+ self._init_workers_sp(model, distributed_init_method)
+
+ # Profile the memory usage and initialize the cache.
+ self._init_cache_sp()
+
+ # Create the scheduler.
+ # NOTE(shengguangming): each process will have independent scheduler
+ self.scheduler = Scheduler(scheduler_config, cache_config, lora_config)
+
+ # Metric Logging.
+ if self.log_stats:
+ self.stat_logger = StatLogger(local_interval=_LOCAL_LOGGING_INTERVAL_SEC)
+
+ # Logging.
+ self.last_logging_time = 0.0
+ # List of (timestamp, num_tokens)
+ self.num_prompt_tokens: List[Tuple[float, int]] = []
+ # List of (timestamp, num_tokens)
+ self.num_generation_tokens: List[Tuple[float, int]] = []
+
+ def _init_tokenizer(self, tokenizer, **tokenizer_init_kwargs):
+ init_kwargs = dict(enable_lora=bool(self.lora_config),
+ max_num_seqs=self.scheduler_config.max_num_seqs,
+ max_input_length=None)
+ init_kwargs.update(tokenizer_init_kwargs)
+ self.tokenizer: TokenizerGroup = TokenizerGroup(tokenizer, **init_kwargs)
+
+ # TODO: check get_lora_tokenizer func
+ def get_tokenizer_for_seq(self, sequence: Sequence):
+ return self.tokenizer.get_lora_tokenizer(sequence.lora_request)
+
+ def _init_workers_sp(self, model, distributed_init_method: str):
+ # Lazy import the Worker to avoid importing torch.cuda/xformers
+ # before CUDA_VISIBLE_DEVICES is set in the Worker
+ from .worker import Worker # pylint: disable=import-outside-toplevel
+
+        rank = int(os.getenv("RANK"))  # set by torchrun / the distributed launcher
+
+ self.worker = Worker(
+ model,
+ self.model_config,
+ self.parallel_config,
+ self.scheduler_config,
+ self.device_config,
+ rank,
+ distributed_init_method,
+ lora_config=self.lora_config,
+ kv_cache_dtype=self.cache_config.cache_dtype,
+ )
+
+ # NOTE(shengguangming): torch.distributed.init_process_group will be called inside the init_model()
+ self.worker.init_model()
+ self.worker.load_model()
+
+ def _verify_args(self) -> None:
+ self.model_config.verify_with_parallel_config(self.parallel_config)
+ self.cache_config.verify_with_parallel_config(self.parallel_config)
+
+ def _init_cache_sp(self) -> None:
+ """Profiles the memory usage and initializes the KV cache."""
+ # Get the maximum number of blocks that can be allocated on GPU and CPU.
+ num_blocks = self.worker.profile_num_available_blocks(
+ block_size=self.cache_config.block_size,
+ gpu_memory_utilization=self.cache_config.gpu_memory_utilization,
+ cpu_swap_space=self.cache_config.swap_space_bytes,
+ cache_dtype=self.cache_config.cache_dtype,
+ )
+
+        # NOTE(shengguangming): we no longer use a shared centralized controller;
+        # each process has its own scheduler.
+        num_gpu_blocks, num_cpu_blocks = num_blocks
+
+ # FIXME(woosuk): Change to debug log.
+ logger.info(f"# GPU blocks: {num_gpu_blocks}, "
+ f"# CPU blocks: {num_cpu_blocks}")
+
+ if num_gpu_blocks <= 0:
+ raise ValueError("No available memory for the cache blocks. "
+ "Try increasing `gpu_memory_utilization` when "
+ "initializing the engine.")
+
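+        # e.g. block_size=16 with num_gpu_blocks=2048 yields a 32768-token KV-cache
+        # capacity, which must be able to hold the model's full context window.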
+ max_seq_len = self.cache_config.block_size * num_gpu_blocks
+ if self.model_config.max_model_len > max_seq_len:
+ raise ValueError(f"The model's max seq len ({self.model_config.max_model_len}) "
+ "is larger than the maximum number of tokens that can be "
+ f"stored in KV cache ({max_seq_len}). Try increasing "
+ "`gpu_memory_utilization` or decreasing `max_model_len` when "
+ "initializing the engine.")
+
+ self.cache_config.num_gpu_blocks = num_gpu_blocks
+ self.cache_config.num_cpu_blocks = num_cpu_blocks
+
+ # Initialize the cache.
+ self.worker.init_cache_engine(cache_config=self.cache_config)
+ self.worker.warm_up_model()
+
+ def init_cache_engine(self):
+ self.worker.init_cache_engine(cache_config=self.cache_config)
+
+ def free_cache_engine(self):
+ self.worker.free_cache_engine()
+
+ @classmethod
+ def from_engine_args(cls, model, tokenizer, engine_args: EngineArgs) -> "LLMEngine":
+ """Creates an LLM engine from the engine arguments."""
+ # Create the engine configs.
+ engine_configs = engine_args.create_engine_configs()
+ parallel_config = engine_configs[2]
+ # Initialize the cluster.
+ distributed_init_method, placement_group = initialize_cluster(parallel_config)
+ # Create the LLM engine.
+ engine = cls(model,
+ tokenizer,
+ *engine_configs,
+ distributed_init_method,
+ placement_group,
+ log_stats=not engine_args.disable_log_stats)
+ return engine
+
+ def add_request(
+ self,
+ request_id: str,
+ prompt: Optional[str],
+ sampling_params: SamplingParams,
+ prompt_token_ids: Optional[List[int]] = None,
+ arrival_time: Optional[float] = None,
+ lora_request: Optional[LoRARequest] = None,
+ prefix_pos: Optional[int] = None,
+ ) -> None:
+ """Add a request to the engine's request pool.
+
+ The request is added to the request pool and will be processed by the
+ scheduler as `engine.step()` is called. The exact scheduling policy is
+ determined by the scheduler.
+
+ Args:
+ request_id: The unique ID of the request.
+ prompt: The prompt string. Can be None if prompt_token_ids is
+ provided.
+ sampling_params: The sampling parameters for text generation.
+ prompt_token_ids: The token IDs of the prompt. If None, we
+ use the tokenizer to convert the prompts to token IDs.
+ arrival_time: The arrival time of the request. If None, we use
+ the current monotonic time.
+ prefix_pos: If not None, we use the given position as the prefix
+ position for each prompt. We will cache the prefix's KV
+ cache and reuse it for the next request with the same prefix.
+ This is an experimental feature, and may be replaced with
+ automatic prefix caching in the future.
+
+ Details:
+ - Set arrival_time to the current time if it is None.
+ - Set prompt_token_ids to the encoded prompt if it is None.
+ - Create `best_of` number of :class:`~vllm.Sequence` objects.
+ - Create a :class:`~vllm.SequenceGroup` object
+ from the list of :class:`~vllm.Sequence`.
+ - Add the :class:`~vllm.SequenceGroup` object to the scheduler.
+
+ Example:
+ >>> # initialize engine
+            >>> engine = LLMEngine.from_engine_args(model, tokenizer, engine_args)
+ >>> # set request arguments
+ >>> example_prompt = "Who is the president of the United States?"
+ >>> sampling_params = SamplingParams(temperature=0.0)
+ >>> request_id = 0
+ >>>
+ >>> # add the request to the engine
+ >>> engine.add_request(
+ >>> str(request_id),
+ >>> example_prompt,
+ >>> SamplingParams(temperature=0.0))
+ >>> # continue the request processing
+ >>> ...
+ """
+ if lora_request is not None and not self.lora_config:
+ raise ValueError(f"Got lora_request {lora_request} but LoRA is "
+ "not enabled!")
+ if arrival_time is None:
+ arrival_time = time.monotonic()
+ if prompt_token_ids is None:
+ assert prompt is not None
+ prompt_token_ids = self.tokenizer.encode(prompt)
+
+ # Create the sequences.
+ block_size = self.cache_config.block_size
+ seq_id = next(self.seq_counter)
+ seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, lora_request)
+
+        # Check whether the input specifies a prefix to cache.
+        if prefix_pos is not None:
+            lora_int_id = lora_request.lora_int_id if lora_request else 0
+            prefix = self.scheduler.prefix_pool.add_or_get_prefix(prompt_token_ids[:prefix_pos], lora_int_id)
+        else:
+            prefix = None
+
+ # Create the sequence group.
+ seq_group = SequenceGroup(request_id, [seq], sampling_params, arrival_time, lora_request, prefix)
+
+ # Add the sequence group to the scheduler.
+ self.scheduler.add_seq_group(seq_group)
+
+ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
+ """Aborts a request(s) with the given ID.
+
+ Args:
+ request_id: The ID(s) of the request to abort.
+
+ Details:
+ - Refer to the
+ :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`
+ from class :class:`~vllm.core.scheduler.Scheduler`.
+
+ Example:
+ >>> # initialize engine and add a request with request_id
+ >>> request_id = str(0)
+ >>> # abort the request
+ >>> engine.abort_request(request_id)
+ """
+ self.scheduler.abort_seq_group(request_id)
+
+ def get_model_config(self) -> ModelConfig:
+ """Gets the model configuration."""
+ return self.model_config
+
+ def get_num_unfinished_requests(self) -> int:
+ """Gets the number of unfinished requests."""
+ return self.scheduler.get_num_unfinished_seq_groups()
+
+ def has_unfinished_requests(self) -> bool:
+ """Returns True if there are unfinished requests."""
+ return self.scheduler.has_unfinished_seqs()
+
+ def _check_beam_search_early_stopping(
+ self,
+ early_stopping: Union[bool, str],
+ sampling_params: SamplingParams,
+ best_running_seq: Sequence,
+ current_worst_seq: Sequence,
+ ) -> bool:
+ assert sampling_params.use_beam_search
+ length_penalty = sampling_params.length_penalty
+ if early_stopping is True:
+ return True
+
+ current_worst_score = (current_worst_seq.get_beam_search_score(
+ length_penalty=length_penalty, eos_token_id=self.get_tokenizer_for_seq(current_worst_seq).eos_token_id))
+ if early_stopping is False:
+ highest_attainable_score = (best_running_seq.get_beam_search_score(
+ length_penalty=length_penalty, eos_token_id=self.get_tokenizer_for_seq(best_running_seq).eos_token_id))
+ else:
+ assert early_stopping == "never"
+ if length_penalty > 0.0:
+ # If length_penalty > 0.0, beam search will prefer longer
+ # sequences. The highest attainable score calculation is
+ # based on the longest possible sequence length in this case.
+ max_possible_length = max(best_running_seq.get_prompt_len() + sampling_params.max_tokens,
+ self.scheduler_config.max_model_len)
+ highest_attainable_score = (best_running_seq.get_beam_search_score(
+ length_penalty=length_penalty,
+ eos_token_id=self.get_tokenizer_for_seq(best_running_seq).eos_token_id,
+ seq_len=max_possible_length))
+ else:
+ # Otherwise, beam search will prefer shorter sequences. The
+ # highest attainable score calculation is based on the current
+ # sequence length.
+                highest_attainable_score = (best_running_seq.get_beam_search_score(
+                    length_penalty=length_penalty,
+                    eos_token_id=self.get_tokenizer_for_seq(best_running_seq).eos_token_id))
+        return current_worst_score >= highest_attainable_score
+
+ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, outputs: SequenceGroupOutput) -> None:
+
+ # Process prompt logprobs
+ prompt_logprobs = outputs.prompt_logprobs
+ if prompt_logprobs is not None:
+ seq_group.prompt_logprobs = prompt_logprobs
+
+ # Process samples
+ samples = outputs.samples
+ parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
+ existing_finished_seqs = seq_group.get_finished_seqs()
+ parent_child_dict = {parent_seq.seq_id: [] for parent_seq in parent_seqs}
+ for sample in samples:
+ parent_child_dict[sample.parent_seq_id].append(sample)
+ # List of (child, parent)
+ child_seqs: List[Tuple[Sequence, Sequence]] = []
+
+ # Process the child samples for each parent sequence
+ for parent in parent_seqs:
+ child_samples: List[SequenceOutput] = parent_child_dict[parent.seq_id]
+ if len(child_samples) == 0:
+ # This parent sequence has no children samples. Remove
+ # the parent sequence from the sequence group since it will
+ # not be used in the future iterations.
+ parent.status = SequenceStatus.FINISHED_ABORTED
+ seq_group.remove(parent.seq_id)
+ self.scheduler.free_seq(parent)
+ continue
+ # Fork the parent sequence if there are multiple child samples.
+ for child_sample in child_samples[:-1]:
+ new_child_seq_id = next(self.seq_counter)
+ child = parent.fork(new_child_seq_id)
+ child.append_token_id(child_sample.output_token, child_sample.logprobs)
+ child_seqs.append((child, parent))
+ # Continue the parent sequence for the last child sample.
+ # We reuse the parent sequence here to reduce redundant memory
+ # copies, especially when using non-beam search sampling methods.
+ last_child_sample = child_samples[-1]
+ parent.append_token_id(last_child_sample.output_token, last_child_sample.logprobs)
+ child_seqs.append((parent, parent))
+
+ for seq, _ in child_seqs:
+ # self._decode_sequence(seq, seq_group.sampling_params)
+ self._check_stop(seq, seq_group.sampling_params)
+
+ # Non-beam search case
+ if not seq_group.sampling_params.use_beam_search:
+ # For newly created child sequences, add them to the sequence group
+ # and fork them in block manager if they are not finished.
+ for seq, parent in child_seqs:
+ if seq is not parent:
+ seq_group.add(seq)
+ if not seq.is_finished():
+ self.scheduler.fork_seq(parent, seq)
+
+ # Free the finished and selected parent sequences' memory in block
+ # manager. Keep them in the sequence group as candidate output.
+ # NOTE: we need to fork the new sequences before freeing the
+ # old sequences.
+ for seq, parent in child_seqs:
+ if seq is parent and seq.is_finished():
+ self.scheduler.free_seq(seq)
+ return
+
+ # Beam search case
+ # Select the child sequences to keep in the sequence group.
+ selected_child_seqs = []
+ unselected_child_seqs = []
+ beam_width = seq_group.sampling_params.best_of
+ length_penalty = seq_group.sampling_params.length_penalty
+
+ # Select the newly finished sequences with the highest scores
+ # to replace existing finished sequences.
+ # Tuple of (seq, parent, is_new)
+ existing_finished_seqs = [(seq, None, False) for seq in existing_finished_seqs]
+ new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs if seq.is_finished()]
+ all_finished_seqs = existing_finished_seqs + new_finished_seqs
+ # Sort the finished sequences by their scores.
+ all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score(
+ length_penalty=length_penalty, eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id),
+ reverse=True)
+ for seq, parent, is_new in all_finished_seqs[:beam_width]:
+ if is_new:
+ # A newly generated child sequence finishes and has a high
+ # score, so we will add it into the sequence group.
+ selected_child_seqs.append((seq, parent))
+ for seq, parent, is_new in all_finished_seqs[beam_width:]:
+ if is_new:
+ # A newly generated child sequence finishes but has a low
+ # score, so we will not add it into the sequence group.
+                # Additionally, if this sequence is a continuation of a
+                # parent sequence, we will need to remove the parent sequence
+                # from the sequence group.
+ unselected_child_seqs.append((seq, parent))
+ else:
+ # An existing finished sequence has a low score, so we will
+ # remove it from the sequence group.
+ seq_group.remove(seq.seq_id)
+
+        # Select the top beam_width sequences from the running
+ # sequences for the next iteration to continue the beam
+ # search.
+ running_child_seqs = [(seq, parent) for seq, parent in child_seqs if not seq.is_finished()]
+ # Sort the running sequences by their scores.
+ running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score(
+ length_penalty=length_penalty, eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id),
+ reverse=True)
+
+ # Check if we can stop the beam search.
+ if len(running_child_seqs) == 0:
+ # No running sequences, stop the beam search.
+ stop_beam_search = True
+ elif len(all_finished_seqs) < beam_width:
+ # Not enough finished sequences, continue the beam search.
+ stop_beam_search = False
+ else:
+ # Check the early stopping criteria
+ best_running_seq = running_child_seqs[0][0]
+ current_worst_seq = all_finished_seqs[beam_width - 1][0]
+ stop_beam_search = self._check_beam_search_early_stopping(seq_group.sampling_params.early_stopping,
+ seq_group.sampling_params, best_running_seq,
+ current_worst_seq)
+
+ if stop_beam_search:
+ # Stop the beam search and remove all the running sequences from
+ # the sequence group.
+ unselected_child_seqs.extend(running_child_seqs)
+ else:
+ # Continue the beam search and select the top beam_width sequences
+ # to continue the beam search.
+ selected_child_seqs.extend(running_child_seqs[:beam_width])
+ # The remaining running sequences will not be used in the next
+ # iteration. Again, if these sequences are continuations of
+ # parent sequences, we will need to remove the parent sequences
+ # from the sequence group.
+ unselected_child_seqs.extend(running_child_seqs[beam_width:])
+
+ # For newly created child sequences, add them to the sequence group
+ # and fork them in block manager if they are not finished.
+ for seq, parent in selected_child_seqs:
+ if seq is not parent:
+ seq_group.add(seq)
+ if not seq.is_finished():
+ self.scheduler.fork_seq(parent, seq)
+
+ # Free the finished and selected parent sequences' memory in block
+ # manager. Keep them in the sequence group as candidate output.
+ for seq, parent in selected_child_seqs:
+ if seq is parent and seq.is_finished():
+ self.scheduler.free_seq(seq)
+
+ # Remove the unselected parent sequences from the sequence group and
+ # free their memory in block manager.
+ for seq, parent in unselected_child_seqs:
+ if seq is parent:
+ # Remove the parent sequence if it is not selected for next
+ # iteration
+ seq_group.remove(seq.seq_id)
+ self.scheduler.free_seq(seq)
+
+ def _process_model_outputs(self, output: SamplerOutput, scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]:
+ # Update the scheduled sequence groups with the model outputs.
+ scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
+ for seq_group, outputs in zip(scheduled_seq_groups, output):
+ self._process_sequence_group_outputs(seq_group, outputs)
+
+ # Free the finished sequence groups.
+ self.scheduler.free_finished_seq_groups()
+
+ # Create the outputs.
+ request_outputs: List[RequestOutput] = []
+ for seq_group in scheduled_seq_groups:
+ request_output = RequestOutput.from_seq_group(seq_group)
+ request_outputs.append(request_output)
+ for seq_group in scheduler_outputs.ignored_seq_groups:
+ request_output = RequestOutput.from_seq_group(seq_group)
+ request_outputs.append(request_output)
+
+ # Update prefix state, now all the uncomputed prefixes are computed.
+ for seq_group in scheduled_seq_groups:
+ if (seq_group.prefix is not None and seq_group.prefix.allocated and not seq_group.prefix.computed):
+ seq_group.prefix.computed = True
+
+ # Log stats.
+ if self.log_stats:
+ self.stat_logger.log(self._get_stats(scheduler_outputs))
+
+ return request_outputs
+
+ def step(self) -> List[RequestOutput]:
+ """Performs one decoding iteration and returns newly generated results.
+
+ This function performs one decoding iteration of the engine. It first
+ schedules the sequences to be executed in the next iteration and the
+ token blocks to be swapped in/out/copy. Then, it executes the model
+ and updates the scheduler with the model outputs. Finally, it decodes
+ the sequences and returns the newly generated results.
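+
+        Example (illustrative; this is the same loop `LLM._run_engine` runs):
+            >>> while engine.has_unfinished_requests():
+            >>>     request_outputs = engine.step()
+            >>>     for request_output in request_outputs:
+            >>>         if request_output.finished:
+            >>>             print(request_output)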
+ """
+ seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
+ if not scheduler_outputs.is_empty():
+            output = self.worker.execute_model(
+                seq_group_metadata_list=seq_group_metadata_list,  # TODO: check this input
+                blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
+                blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
+                blocks_to_copy=scheduler_outputs.blocks_to_copy,
+            )
+ else:
+ return [RequestOutput.from_seq_group(seq_group) for seq_group in scheduler_outputs.ignored_seq_groups]
+
+ return self._process_model_outputs(output, scheduler_outputs)
+
+ def do_log_stats(self) -> None:
+        """Force a stats log even when no requests are active."""
+ if self.log_stats:
+ self.stat_logger.log(self._get_stats(scheduler_outputs=None))
+
+ def _get_stats(self, scheduler_outputs: Optional[SchedulerOutputs]) -> Stats:
+ """Get Stats to be Logged to Prometheus."""
+ now = time.monotonic()
+
+ # KV Cache Usage in %.
+ num_total_gpu = self.cache_config.num_gpu_blocks
+ num_free_gpu = self.scheduler.block_manager.get_num_free_gpu_blocks()
+ gpu_cache_usage = 1.0 - (num_free_gpu / num_total_gpu)
+
+ num_total_cpu = self.cache_config.num_cpu_blocks
+ cpu_cache_usage = 0.
+ if num_total_cpu > 0:
+ num_free_cpu = self.scheduler.block_manager.get_num_free_cpu_blocks()
+ cpu_cache_usage = 1.0 - (num_free_cpu / num_total_cpu)
+
+ # Scheduler State
+ num_running = len(self.scheduler.running)
+ num_swapped = len(self.scheduler.swapped)
+ num_waiting = len(self.scheduler.waiting)
+
+ # Iteration stats if we have scheduler output.
+ num_prompt_tokens = 0
+ num_generation_tokens = 0
+ time_to_first_tokens = []
+ time_per_output_tokens = []
+ time_e2e_requests = []
+ if scheduler_outputs is not None:
+ prompt_run = scheduler_outputs.prompt_run
+
+ # Number of Tokens.
+ if prompt_run:
+ num_prompt_tokens = scheduler_outputs.num_batched_tokens
+ else:
+ num_generation_tokens = scheduler_outputs.num_batched_tokens
+
+ # Latency Timings.
+ time_last_iters = []
+ for seq_group in scheduler_outputs.scheduled_seq_groups:
+ # Time since last token. (n.b. updates seq_group.last_token_time)
+ time_last_iters.append(seq_group.get_last_latency(now))
+ # Time since arrival for all finished requests.
+ if seq_group.is_finished():
+ time_e2e_requests.append(now - seq_group.arrival_time)
+
+ time_to_first_tokens = time_last_iters if prompt_run else []
+ time_per_output_tokens = [] if prompt_run else time_last_iters
+
+ return Stats(
+ now=now,
+ num_running=num_running,
+ num_swapped=num_swapped,
+ num_waiting=num_waiting,
+ gpu_cache_usage=gpu_cache_usage,
+ cpu_cache_usage=cpu_cache_usage,
+ num_prompt_tokens=num_prompt_tokens,
+ num_generation_tokens=num_generation_tokens,
+ time_to_first_tokens=time_to_first_tokens,
+ time_per_output_tokens=time_per_output_tokens,
+ time_e2e_requests=time_e2e_requests,
+ )
+
+ # TODO: we may not need to decode
+ def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None:
+ """Decodes the new token for a sequence."""
+ (new_tokens, new_output_text, prefix_offset, read_offset) = detokenize_incrementally(
+ self.get_tokenizer_for_seq(seq),
+ all_input_ids=seq.get_token_ids(),
+ prev_tokens=seq.tokens,
+ prefix_offset=seq.prefix_offset,
+ read_offset=seq.read_offset,
+ skip_special_tokens=prms.skip_special_tokens,
+ spaces_between_special_tokens=prms.spaces_between_special_tokens,
+ )
+ if seq.tokens is None:
+ seq.tokens = new_tokens
+ else:
+ seq.tokens.extend(new_tokens)
+ seq.prefix_offset = prefix_offset
+ seq.read_offset = read_offset
+ seq.output_text += new_output_text
+
+ def _check_stop(self, seq: Sequence, sampling_params: SamplingParams) -> None:
+ """Stop the finished sequences."""
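+        # NOTE: the stop-string / stop-token checks below are disabled in this
+        # adaptation; sequences are not incrementally detokenized here (the
+        # _decode_sequence call is commented out), so seq.output_text stays empty.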
+ # for stop_str in sampling_params.stop:
+ # if seq.output_text.endswith(stop_str):
+ # self._finalize_sequence(seq, sampling_params, stop_str)
+ # seq.status = SequenceStatus.FINISHED_STOPPED
+ # return
+ # if seq.get_last_token_id() in sampling_params.stop_token_ids:
+ # stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens(seq.get_last_token_id())
+ # self._finalize_sequence(seq, sampling_params, stop_str)
+ # seq.status = SequenceStatus.FINISHED_STOPPED
+ # return
+
+ # Check if the sequence has reached max_model_len.
+ if seq.get_len() > self.scheduler_config.max_model_len:
+ seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
+ return
+
+ # Check if the sequence has reached max_tokens.
+ if seq.get_output_len() == sampling_params.max_tokens:
+ seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
+ return
+
+ # Check if the sequence has generated the EOS token.
+ if ((not sampling_params.ignore_eos) and
+ seq.get_last_token_id() == self.get_tokenizer_for_seq(seq).eos_token_id):
+ seq.status = SequenceStatus.FINISHED_STOPPED
+ return
+
+ def _finalize_sequence(self, seq: Sequence, sampling_params: SamplingParams, stop_string: str) -> None:
+ if not sampling_params.include_stop_str_in_output and stop_string:
+ # Truncate the output text so that the stop string is
+ # not included in the output.
+ seq.output_text = seq.output_text[:-len(stop_string)]
+
+ def add_lora(self, lora_request: LoRARequest) -> bool:
+ assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
+ return self.worker.add_lora(lora_request)
+
+ def remove_lora(self, lora_id: int) -> bool:
+ assert lora_id > 0, "lora_id must be greater than 0."
+ return self.worker.remove_lora(lora_id)
+
+ def list_loras(self) -> List[int]:
+ return self.worker.list_loras()
+
+ def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor]) -> None:
+ self.worker.sync_model_weights(actor_weights=actor_weights)
+
+ def offload_model_weights(self) -> None:
+ self.worker.offload_model_weights()
+
+
+def initialize_cluster(
+ parallel_config: ParallelConfig,
+ engine_use_ray: bool = False,
+ ray_address: Optional[str] = None,
+) -> Tuple[str, Optional[None]]:
+    """Initialize the distributed cluster (locally, without Ray, in this adaptation).
+
+ Args:
+ parallel_config: The configurations for parallel execution.
+ engine_use_ray: Whether to use Ray for async engine.
+ ray_address: The address of the Ray cluster. If None, uses
+ the default Ray cluster address.
+
+    Returns:
+        A tuple of (`distributed_init_method`, `placement_group`). The
+        `distributed_init_method` is the address for initializing the
+        distributed backend. `placement_group` is always None here, since
+        this adaptation does not launch Ray workers.
+ """
+
+ # Initialize cluster locally.
+ port = get_open_port()
+ # We need to setup the distributed init method to make sure
+ # the distributed megatron code (e.g., get world size) works correctly.
+ distributed_init_method = f"tcp://localhost:{port}"
+ return distributed_init_method, None
+
+
+def get_open_port():
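+    # Bind to port 0 so the OS picks a free port; the socket is closed on
+    # return, so there is a small window in which another process could
+    # grab the port before the caller binds it.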
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+ s.bind(("", 0))
+ return s.getsockname()[1]
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/model_loader.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/model_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..450e2f4b49c22b86b5ce424a303c535fb1596a99
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/model_loader.py
@@ -0,0 +1,275 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/model_loader
+"""Utilities for selecting and loading models."""
+import contextlib
+from typing import Dict, Type, Union
+
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig, PreTrainedModel
+from megatron.core.tensor_parallel.utils import VocabUtility
+
+from vllm.model_executor.models import ModelRegistry
+from vllm.model_executor.weight_utils import (get_quant_config, initialize_dummy_weights)
+
+from .config import ModelConfig
+from vllm.config import DeviceConfig, LoRAConfig
+from .weight_loaders import *
+from vllm.model_executor.sampling_metadata import SamplingMetadata, SamplingTensors
+from vllm.sequence import SamplerOutput
+from typing import Optional
+from vllm.model_executor.layers.sampler import (Sampler, _prune_hidden_states, _apply_logits_processors,
+                                                _apply_penalties, _apply_top_k_top_p, _apply_min_p, _sample,
+                                                _get_logprobs, _build_sampler_output)
+
+
+@contextlib.contextmanager
+def _set_default_torch_dtype(dtype: torch.dtype):
+ """Sets the default torch dtype to the given dtype."""
+ old_dtype = torch.get_default_dtype()
+ torch.set_default_dtype(dtype)
+ yield
+ torch.set_default_dtype(old_dtype)
+
+
+def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
+ architectures = getattr(config, "architectures", [])
+ for arch in architectures:
+ model_cls = ModelRegistry.load_model_cls(arch)
+ if model_cls is not None:
+ return model_cls
+ raise ValueError(f"Model architectures {architectures} are not supported for now. "
+ f"Supported architectures: {ModelRegistry.get_supported_archs()}")
+
+
+from vllm.model_executor.layers.linear import *
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead
+from vllm.model_executor.layers.activation import ScaledActivation
+
+__LAYER_WEIGHT_LOADER_REGISTRY__ = {
+ ColumnParallelLinear: parallel_weight_loader,
+ MergedColumnParallelLinear: parallel_weight_loader,
+ QKVParallelLinear: parallel_weight_loader,
+ RowParallelLinear: parallel_weight_loader,
+ VocabParallelEmbedding: parallel_weight_loader,
+ ParallelLMHead: parallel_weight_loader
+ # "ScaledActivation.weight_loader": ScaledActivation, # TODO(shengguangming): latest commit in vllm fix awq for this function and add load_weights
+ # "default_weight_loader": default_weight_loader
+}
+
+# NOTE(gmsheng): change the weight_loader function in runtime
+for layer_class, weight_loader in __LAYER_WEIGHT_LOADER_REGISTRY__.items():
+ layer_class.weight_loader = weight_loader
+
+__MODEL_WEIGHT_LOADER_REGISTRY__ = {
+ 'GPT2LMHeadModel': gpt2_weight_loader,
+ 'LlamaForCausalLM': llama_weight_loader,
+ 'LLaMAForCausalLM': llama_weight_loader,
+ 'MistralForCausalLM': mistral_weight_loader,
+}
+
+# FIXME(shengguangming): vLLM pads the vocab size to a multiple of 64, which can
+# cause out-of-bounds indexing when loading the actor's unpadded embedding weights,
+# so we rewrite the init function of VocabParallelEmbedding to skip the padding.
+DEFAULT_VOCAB_PADDING_SIZE = 64
+
+
+def vocab_init(self,
+ num_embeddings: int,
+ embedding_dim: int,
+ params_dtype: Optional[torch.dtype] = None,
+ org_num_embeddings: Optional[int] = None,
+ padding_size: int = DEFAULT_VOCAB_PADDING_SIZE):
+ super(VocabParallelEmbedding, self).__init__()
+
+ # Keep the input dimensions.
+    # TODO (pad to be divisible by 4)
+ self.num_embeddings = num_embeddings
+ self.org_vocab_size = org_num_embeddings or num_embeddings
+
+ # self.num_embeddings_padded = pad_vocab_size(num_embeddings,
+ # padding_size)
+ self.embedding_dim = embedding_dim
+ if params_dtype is None:
+ params_dtype = torch.get_default_dtype()
+ self.tp_size = get_tensor_model_parallel_world_size()
+    # Divide the weight matrix along the vocabulary dimension.
+
+ self.vocab_start_index, self.vocab_end_index = (VocabUtility.vocab_range_from_global_vocab_size(
+ self.num_embeddings, get_tensor_model_parallel_rank(), self.tp_size))
+ self.num_embeddings_per_partition = (self.vocab_end_index - self.vocab_start_index)
+ self.weight = Parameter(
+ torch.empty(
+ self.num_embeddings_per_partition,
+ self.embedding_dim,
+ # device=torch.cuda.current_device(),
+ dtype=params_dtype))
+ set_weight_attrs(self.weight, {"parallel_dim": 0, "weight_loader": self.weight_loader})
+
+
+VocabParallelEmbedding.__init__ = vocab_init
+
+
+def _get_model_weight_loader(arch: str):
+ if arch in __MODEL_WEIGHT_LOADER_REGISTRY__:
+ return __MODEL_WEIGHT_LOADER_REGISTRY__[arch]
+ raise ValueError(f"Model architectures {arch} are not supported for now. "
+ f"Supported architectures: {ModelRegistry.get_supported_archs()}")
+
+
+def get_model(actor_model: Union[PreTrainedModel, Dict],
+ model_config: ModelConfig,
+ device_config: DeviceConfig,
+ lora_config: Optional[LoRAConfig] = None) -> nn.Module:
+ model_class = _get_model_architecture(model_config.hf_config)
+
+ # Get the quantization config.
+ linear_method = None
+ quant_config = None
+ if model_config.quantization is not None:
+ quant_config = get_quant_config(model_config.quantization, model_config.model, model_config.hf_config,
+ model_config.download_dir)
+ capability = torch.cuda.get_device_capability()
+ capability = capability[0] * 10 + capability[1]
+ if capability < quant_config.get_min_capability():
+ raise ValueError(f"The quantization method {model_config.quantization} is not "
+ "supported for the current GPU. "
+ f"Minimum capability: {quant_config.get_min_capability()}. "
+ f"Current capability: {capability}.")
+ supported_dtypes = quant_config.get_supported_act_dtypes()
+ if model_config.dtype not in supported_dtypes:
+ raise ValueError(f"{model_config.dtype} is not supported for quantization "
+ f"method {model_config.quantization}. Supported dtypes: "
+ f"{supported_dtypes}")
+ linear_method = quant_config.get_linear_method()
+
+ with _set_default_torch_dtype(model_config.dtype):
+ # Create a model instance.
+ # The weights will be initialized as empty tensors.
+ # with torch.device(device_config.device):
+ # NOTE(sgm): init the model in cpu
+ model = model_class(model_config.hf_config, linear_method)
+
+ if model_config.load_format == "dummy":
+ model = model.cuda()
+ # NOTE(woosuk): For accurate performance evaluation, we assign
+ # random values to the weights.
+ initialize_dummy_weights(model)
+ elif model_config.load_format == 'model' or model_config.load_format == 'auto':
+ # NOTE(shengguangming) Load the weights from the actor model
+ if isinstance(actor_model, nn.Module):
+ load_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)), vllm_model=model)
+ else:
+ load_weights(actor_weights=actor_model, vllm_model=model)
+
+        # NOTE(sgm): some weights already point to GPU memory, but this move is still needed.
+        model = model.cuda()  # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage
+ return model.eval()
+
+
+# NOTE: `actor_weights` is typically the actor model's state_dict() or named_parameters dict.
+def load_weights(actor_weights: Dict, vllm_model: nn.Module):
+ weight_loader = _get_model_weight_loader(vllm_model.__class__.__name__)
+ weight_loader(actor_weights, vllm_model)
+    # NOTE(sgm): to reduce peak memory usage, the vLLM model is offloaded to CPU
+    # after init; we need to move it back to GPU here after syncing the model
+    # weights in the first iteration (.cuda() moves the module in place).
+    vllm_model.cuda()
+
+
+# FIXME(sgm): hack the Sampler functions in vllm v0.3.1.
+# Upstream uses Ray, where sampler results only need to reach the driver node, so
+# a gather is enough. We run SPMD without a central scheduler, so an all_gather
+# is required instead (aligned with v0.2.6).
+def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor,
+ embedding_bias: Optional[torch.Tensor]) -> torch.Tensor:
+ # Get the logits for the next tokens.
+ logits = torch.matmul(hidden_states, embedding.t())
+ if embedding_bias is not None:
+ logits += embedding_bias
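+    # The logits are sharded along the vocab dimension across TP ranks; an
+    # all_gather (instead of upstream's gather to the driver) reassembles the
+    # full (num_tokens, padded_vocab_size) matrix on every rank.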
+ logits = tensor_model_parallel_all_gather(logits)
+ # Remove paddings in vocab (if any).
+ if logits is not None:
+ logits = logits[:, :self.org_vocab_size]
+ return logits
+
+
+def forward(
+ self,
+ embedding: torch.Tensor,
+ hidden_states: torch.Tensor,
+ sampling_metadata: SamplingMetadata,
+ embedding_bias: Optional[torch.Tensor] = None,
+) -> Optional[SamplerOutput]:
+ # Get the hidden states that we use for sampling.
+ hidden_states = _prune_hidden_states(hidden_states, sampling_metadata)
+
+ # Get the logits for the next tokens.
+ logits = self._get_logits(hidden_states, embedding, embedding_bias)
+    # Save the raw logprobs (computed before penalties/temperature) for the sampler output.
+ origin_logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)
+
+ # Only perform sampling in the driver worker.
+ # Note: `_get_logits` is still distributed across TP workers because
+ # the `embedding` weight is distributed across TP workers.
+ # TODO(zhuohan): Change the get_logits part to a separate stage.
+ if not sampling_metadata.perform_sampling:
+ return None
+
+ assert logits is not None
+ _, vocab_size = logits.shape
+
+ # Apply logits processors (if any).
+ logits = _apply_logits_processors(logits, sampling_metadata)
+
+ # Prepare sampling tensors with pinned memory to avoid blocking.
+ (sampling_tensors, do_penalties, do_top_p_top_k,
+ do_min_p) = SamplingTensors.from_sampling_metadata(sampling_metadata, vocab_size, logits.device, logits.dtype)
+
+ # Apply presence and frequency penalties.
+ if do_penalties:
+ logits = _apply_penalties(logits, sampling_tensors.prompt_tokens, sampling_tensors.output_tokens,
+ sampling_tensors.presence_penalties, sampling_tensors.frequency_penalties,
+ sampling_tensors.repetition_penalties)
+
+ # Apply temperature scaling.
+ # Use in-place division to avoid creating a new tensor.
+ logits.div_(sampling_tensors.temperatures.unsqueeze_(dim=1))
+
+ if do_top_p_top_k:
+ logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, sampling_tensors.top_ks)
+
+ if do_min_p:
+ logits = _apply_min_p(logits, sampling_tensors.min_ps)
+
+ # We use float32 for probabilities and log probabilities.
+ # Compute the probabilities.
+ probs = torch.softmax(logits, dim=-1, dtype=torch.float)
+ # Compute the log probabilities.
+ # Use log_softmax to ensure numerical stability.
+ logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)
+
+ # Sample the next tokens.
+ sample_results = _sample(probs, logprobs, sampling_metadata)
+
+ # Get the logprobs query results.
+ # prompt_logprobs, sample_logprobs = _get_logprobs(
+ # logprobs, sampling_metadata, sample_results)
+ prompt_logprobs, sample_logprobs = _get_logprobs(origin_logprobs, sampling_metadata, sample_results)
+
+ return _build_sampler_output(sample_results, sampling_metadata, prompt_logprobs, sample_logprobs)
+
+
+from vllm.model_executor.layers.sampler import Sampler
+
+Sampler._get_logits = _get_logits
+Sampler.forward = forward
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/model_runner.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/model_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..4acf3422d43c16091977f598111926d636cc3e29
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/model_runner.py
@@ -0,0 +1,285 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/worker/model_runner.py
+
+from typing import Dict, List, Optional, Tuple, Set, Union
+import contextlib
+import time
+import numpy as np
+import torch
+import torch.nn as nn
+
+from vllm.config import (DeviceConfig, ModelConfig, LoRAConfig, ParallelConfig, SchedulerConfig)
+from vllm.logger import init_logger
+from vllm.model_executor import InputMetadata, SamplingMetadata
+from vllm.sampling_params import SamplingParams, SamplingType
+from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
+from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
+from vllm.lora.layers import LoRAMapping
+from vllm.lora.request import LoRARequest
+from vllm.utils import in_wsl
+from vllm.worker.model_runner import ModelRunner, CUDAGraphRunner, _async_h2d
+
+from .model_loader import get_model
+
+logger = init_logger(__name__)
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+_PAD_SLOT_ID = -1
+LORA_WARMUP_RANK = 8
+# Capture graphs for batch size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256.
+# NOTE: _get_graph_batch_size needs to be updated if this list is changed.
+_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)]
+
+
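+# NOTE: this subclass deliberately reuses the upstream name `ModelRunner` so the
+# vendored Worker can import it unchanged; unlike upstream, it receives the actor
+# model itself (or its parameter dict) instead of loading weights from disk.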
+class ModelRunner(ModelRunner):
+
+ def __init__(
+ self,
+ model: Union[nn.Module, Dict], # model itself or its parameter dict
+ model_config: ModelConfig,
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ device_config: DeviceConfig,
+ lora_config: Optional[LoRAConfig],
+ kv_cache_dtype: Optional[str] = "auto",
+ ):
+ self.model_config = model_config
+ self.parallel_config = parallel_config
+ self.scheduler_config = scheduler_config
+ self.lora_config = lora_config
+
+ # model_config can be None in tests/samplers/test_sampler.py.
+ # FIXME(woosuk): This is a hack to make the tests work. Refactor this.
+ self.sliding_window = (model_config.get_sliding_window() if model_config is not None else None)
+
+ self.device_config = (device_config if device_config is not None else DeviceConfig())
+ self.device = self.device_config.device
+
+ self.model = model # this will be replaced by get_model()
+ self.block_size = None # Set after initial profiling.
+ self.lora_manager = None
+
+ self.graph_runners: Dict[int, CUDAGraphRunner] = {}
+ self.graph_memory_pool = None # Set during graph capture.
+
+ self.max_context_len_to_capture = (self.model_config.max_context_len_to_capture
+ if self.model_config is not None else 0)
+ # When using CUDA graph, the input block tables must be padded to
+ # max_context_len_to_capture. However, creating the block table in
+ # Python can be expensive. To optimize this, we cache the block table
+ # in numpy and only copy the actual input content at every iteration.
+ # The shape of the cached block table will be
+ # (max batch size to capture, max context len to capture / block size).
+ self.graph_block_tables = None # Set after initial profiling.
+ # cache in_wsl result
+ self.in_wsl = in_wsl()
+ self.kv_cache_dtype = kv_cache_dtype
+
+ def load_model(self) -> None:
+ self.model = get_model(actor_model=self.model,
+ model_config=self.model_config,
+ device_config=self.device_config,
+ lora_config=self.lora_config)
+ vocab_size = self.model.config.vocab_size
+
+ if self.lora_config:
+ assert hasattr(
+ self.model,
+ "supported_lora_modules") and self.model.supported_lora_modules, "Model does not support LoRA"
+ assert hasattr(self.model, "embedding_modules"), "Model does not have embedding_modules"
+ assert hasattr(self.model, "embedding_padding_modules"), "Model does not have embedding_padding_modules"
+ self.lora_manager = LRUCacheWorkerLoRAManager(
+ self.scheduler_config.max_num_seqs,
+ self.scheduler_config.max_num_batched_tokens + self.scheduler_config.max_paddings, vocab_size,
+ self.lora_config, self.device, self.model.embedding_modules, self.model.embedding_padding_modules)
+ self.model = self.lora_manager.create_lora_manager(self.model)
+
+ def _prepare_sample(
+ self,
+ seq_group_metadata_list: List[SequenceGroupMetadata],
+ prompt_lens: List[int],
+ subquery_lens: Optional[List[int]],
+ ) -> SamplingMetadata:
+ seq_groups: List[Tuple[List[int], SamplingParams]] = []
+ selected_token_indices: List[int] = []
+ selected_token_start_idx = 0
+ categorized_sample_indices = {t: [] for t in SamplingType}
+ categorized_sample_indices_start_idx = 0
+
+ max_subquery_len = max(subquery_lens) if subquery_lens else 1
+ for i, seq_group_metadata in enumerate(seq_group_metadata_list):
+ seq_ids = list(seq_group_metadata.seq_data.keys())
+ sampling_params = seq_group_metadata.sampling_params
+ seq_groups.append((seq_ids, sampling_params))
+
+ if seq_group_metadata.is_prompt:
+ assert len(seq_ids) == 1
+ assert subquery_lens is not None
+ subquery_len = subquery_lens[i]
+ if sampling_params.prompt_logprobs is not None:
+ # NOTE: prompt token positions do not need sampling, so skip them
+ categorized_sample_indices_start_idx += subquery_len - 1
+
+ categorized_sample_indices[sampling_params.sampling_type].append(categorized_sample_indices_start_idx)
+ categorized_sample_indices_start_idx += 1
+
+ if sampling_params.prompt_logprobs is not None:
+ selected_token_indices.extend(
+ range(selected_token_start_idx, selected_token_start_idx + subquery_len - 1))
+ selected_token_indices.append(selected_token_start_idx + subquery_len - 1)
+ selected_token_start_idx += max_subquery_len
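+ # Illustrative example: with subquery_len = 4 and
+ # selected_token_start_idx = 0, requesting prompt_logprobs selects
+ # hidden-state indices 0..3, while the default selects only index 3
+ # (the position that samples the next token); either way the cursor
+ # then advances by max_subquery_len to stay aligned with the padded batch.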
+ else:
+ num_seqs = len(seq_ids)
+ selected_token_indices.extend(range(selected_token_start_idx, selected_token_start_idx + num_seqs))
+ selected_token_start_idx += num_seqs
+
+ categorized_sample_indices[sampling_params.sampling_type].extend(
+ range(categorized_sample_indices_start_idx, categorized_sample_indices_start_idx + num_seqs))
+ categorized_sample_indices_start_idx += num_seqs
+
+ selected_token_indices = _async_h2d(selected_token_indices,
+ dtype=torch.long,
+ target_device=self.device,
+ pin_memory=not self.in_wsl)
+ categorized_sample_indices = {
+ t: _async_h2d(seq_ids, dtype=torch.int, target_device=self.device, pin_memory=not self.in_wsl)
+ for t, seq_ids in categorized_sample_indices.items()
+ }
+
+ seq_data: Dict[int, SequenceData] = {}
+ for seq_group_metadata in seq_group_metadata_list:
+ seq_data.update(seq_group_metadata.seq_data)
+
+ sampling_metadata = SamplingMetadata(
+ seq_groups=seq_groups,
+ seq_data=seq_data,
+ prompt_lens=prompt_lens,
+ selected_token_indices=selected_token_indices,
+ categorized_sample_indices=categorized_sample_indices,
+ )
+ return sampling_metadata
+
+ def prepare_input_tensors(
+ self,
+ seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+ ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, SamplingMetadata, Set[int], LoRAMapping]:
+ # NOTE: We assume that all sequences in the group are all prompts or
+ # all decodes.
+ is_prompt = seq_group_metadata_list[0].is_prompt
+ # Prepare input tensors.
+ if is_prompt:
+ (input_tokens, input_positions, input_metadata, prompt_lens, subquery_lens, lora_index_mapping,
+ lora_prompt_mapping, lora_requests) = self._prepare_prompt(seq_group_metadata_list)
+ else:
+ (input_tokens, input_positions, input_metadata, lora_index_mapping, lora_prompt_mapping,
+ lora_requests) = self._prepare_decode(seq_group_metadata_list)
+ prompt_lens = []
+ subquery_lens = None
+ sampling_metadata = self._prepare_sample(seq_group_metadata_list, prompt_lens, subquery_lens)
+ if self.lora_config:
+ flat_lora_index_mapping = [item for sublist in lora_index_mapping for item in sublist]
+ lora_mapping = LoRAMapping(
+ flat_lora_index_mapping,
+ lora_prompt_mapping,
+ )
+ else:
+ lora_mapping = None
+
+ return (input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests, lora_mapping)
+
+ @torch.inference_mode()
+ def execute_model(
+ self,
+ seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+ kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
+ ) -> Optional[SamplerOutput]:
+ (input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests,
+ lora_mapping) = self.prepare_input_tensors(seq_group_metadata_list)
+
+ if self.lora_config:
+ self.set_active_loras(lora_requests, lora_mapping)
+
+ # Execute the model.
+ if input_metadata.use_cuda_graph:
+ graph_batch_size = input_tokens.shape[0]
+ model_executable = self.graph_runners[graph_batch_size]
+ else:
+ model_executable = self.model
+ hidden_states = model_executable(
+ input_ids=input_tokens,
+ positions=input_positions,
+ kv_caches=kv_caches,
+ input_metadata=input_metadata,
+ )
+
+ # Sample the next token.
+ output = self.model.sample(
+ hidden_states=hidden_states,
+ sampling_metadata=sampling_metadata,
+ )
+ return output
+
+ @torch.inference_mode()
+ def profile_run(self) -> None:
+ # Enable top-k sampling to reflect the accurate memory usage.
+ vocab_size = self.model_config.get_vocab_size()
+ # FIXME(sgm): these sampling params will call cumsum(), which can make
+ # the deterministic cumsum implementation throw an error
+ sampling_params = SamplingParams(top_p=0.99, top_k=vocab_size - 1)
+ max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
+ max_num_seqs = self.scheduler_config.max_num_seqs
+
+ # This represents the maximum number of different requests
+ # that will have unique loras, and therefore the max amount of memory
+ # consumption. Create dummy lora request copies from the lora request
+ # passed in, which contains a lora from the lora warmup path.
+ dummy_lora_requests = []
+ dummy_lora_requests_per_seq = []
+ if self.lora_config:
+ for idx in range(self.lora_config.max_loras):
+ lora_id = idx + 1
+ dummy_lora_request = LoRARequest(
+ lora_name=f"warmup_{lora_id}",
+ lora_int_id=lora_id,
+ lora_local_path="/not/a/real/path",
+ )
+ self.lora_manager.add_dummy_lora(dummy_lora_request, rank=LORA_WARMUP_RANK)
+ dummy_lora_requests.append(dummy_lora_request)
+ dummy_lora_requests_per_seq = [
+ dummy_lora_requests[idx % len(dummy_lora_requests)] for idx in range(max_num_seqs)
+ ]
+
+ # Profile memory usage with max_num_sequences sequences and the total
+ # number of tokens equal to max_num_batched_tokens.
+ seqs: List[SequenceGroupMetadata] = []
+ for group_id in range(max_num_seqs):
+ seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs))
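+ # e.g. max_num_batched_tokens = 10 and max_num_seqs = 4 give seq lens
+ # [3, 3, 2, 2]: the remainder 10 % 4 = 2 is spread over the first two
+ # groups so the lengths sum exactly to max_num_batched_tokens.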
+ seq_data = SequenceData([0] * seq_len)
+ seq = SequenceGroupMetadata(
+ request_id=str(group_id),
+ is_prompt=True,
+ seq_data={group_id: seq_data},
+ sampling_params=sampling_params,
+ block_tables=None,
+ lora_request=dummy_lora_requests_per_seq[group_id] if dummy_lora_requests_per_seq else None,
+ )
+ seqs.append(seq)
+
+ # Run the model with the dummy inputs.
+ num_layers = self.model_config.get_num_layers(self.parallel_config)
+ kv_caches = [(None, None)] * num_layers
+ self.execute_model(seqs, kv_caches)
+ torch.cuda.synchronize()
+ return
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/parallel_state.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/parallel_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3b7a45c8d6b8a62efd5100f27a00c399ec4e9e6
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/parallel_state.py
@@ -0,0 +1,147 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Adapted from
+# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+"""Model and data parallel groups."""
+
+import torch
+import torch.distributed
+
+import vllm.model_executor.parallel_utils.parallel_state as ps
+"""
+This version is strongly tied with Megatron to implement HybridEngine and weight sharing between vllm and Megatron.
+- We assume the Megatron tp+dp+pp world is already established before calling this function.
+
+"""
+
+# Tensor model parallel group that the current rank belongs to.
+_TENSOR_MODEL_PARALLEL_GROUP = None
+
+# Micro data parallel group. The micro data parallel group is an additional dp group that originates from splitting
+# the training tp into infer_tp and micro_dp. By default, we use the order micro_dp - tp.
+_MICRO_DATA_PARALLEL_GROUP = None
+
+
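+# Worked example (illustrative only): with world_size = 8, train_tp = 4 and
+# infer_tp = 2, micro_dp_size = 4 // 2 = 2, so the micro-dp groups built below
+# come out as [0,1], [2,3], [4,5], [6,7] and the inference tp groups
+# interleave as [0,2], [1,3], [4,6], [5,7].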
+def initialize_model_parallel_from_megatron(
+ tensor_model_parallel_size=None # None keeps backward compatibility and means infer_tp = train_tp
+) -> None:
+ from megatron.core import parallel_state as mpu
+ from megatron.distributed import new_group
+ # Get world size and rank. Ensure some consistencies.
+ assert torch.distributed.is_initialized()
+
+ if tensor_model_parallel_size is None:
+ tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
+ else:
+ assert isinstance(tensor_model_parallel_size, int)
+
+ # Build the tensor model-parallel groups.
+ assert ps._TENSOR_MODEL_PARALLEL_GROUP is None, ("tensor model parallel group is already initialized")
+
+ assert tensor_model_parallel_size <= mpu.get_tensor_model_parallel_world_size(
+ ), 'Not implemented for infer_tp > train_tp'
+
+ global _TENSOR_MODEL_PARALLEL_GROUP
+ global _MICRO_DATA_PARALLEL_GROUP
+
+ assert mpu.get_tensor_model_parallel_world_size() % tensor_model_parallel_size == 0
+
+ micro_dp_size = mpu.get_tensor_model_parallel_world_size() // tensor_model_parallel_size
+
+ world_size: int = torch.distributed.get_world_size()
+
+ num_micro_dp_groups = world_size // micro_dp_size
+
+ rank = torch.distributed.get_rank()
+
+ # Build the micro dp groups.
+ assert _MICRO_DATA_PARALLEL_GROUP is None, ("micro data parallel group is already initialized")
+ for i in range(num_micro_dp_groups):
+ ranks = range(i * micro_dp_size, (i + 1) * micro_dp_size)
+ group = new_group(rank=rank, ranks=ranks, group_type='micro_dp')
+ if rank in ranks:
+ _MICRO_DATA_PARALLEL_GROUP = group
+
+ if tensor_model_parallel_size == mpu.get_tensor_model_parallel_world_size():
+ # using the same tp group as Megatron
+ ps._TENSOR_MODEL_PARALLEL_GROUP = mpu.get_tensor_model_parallel_group()
+
+ _TENSOR_MODEL_PARALLEL_GROUP = mpu.get_tensor_model_parallel_group()
+ # no _MICRO_DATA_PARALLEL_GROUP
+ else:
+ # initialize a micro_dp group and a tp group
+ # assume training tp=4, infer tp=2, then, weight is partitioned as
+ # [1], [2], [3], [4] for training and [1,2], [1,2], [3,4], [3,4] for inference
+
+ # Build the inference tp groups
+ train_tp = mpu.get_tensor_model_parallel_world_size()
+ num_tensor_model_parallel_groups_per_train_tp = train_tp // tensor_model_parallel_size
+ num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
+ assert _TENSOR_MODEL_PARALLEL_GROUP is None, ("tensor model parallel group is already initialized")
+ for i in range(num_tensor_model_parallel_groups // num_tensor_model_parallel_groups_per_train_tp):
+ start = train_tp * i
+ end = train_tp * (i + 1)
+ for j in range(num_tensor_model_parallel_groups_per_train_tp):
+ # each inference tp group takes every (train_tp // infer_tp)-th rank in [start, end), offset by j
+ ranks = [r + j for r in range(start, end, num_tensor_model_parallel_groups_per_train_tp)]
+ # group = torch.distributed.new_group(ranks)
+ group = new_group(rank=rank, ranks=ranks, group_type='infer_tp')
+ if rank in ranks:
+ _TENSOR_MODEL_PARALLEL_GROUP = group
+ ps._TENSOR_MODEL_PARALLEL_GROUP = _TENSOR_MODEL_PARALLEL_GROUP
+ # Build the pipeline model-parallel groups.
+ # global _PIPELINE_MODEL_PARALLEL_GROUP
+ # global _PIPELINE_GLOBAL_RANKS
+ # assert ps._PIPELINE_MODEL_PARALLEL_GROUP is None, ("pipeline model parallel group is already initialized")
+
+ # ps._PIPELINE_MODEL_PARALLEL_GROUP = mpu.get_pipeline_model_parallel_group()
+ # ps._PIPELINE_GLOBAL_RANKS = mpu.get_pipeline_model_parallel_ranks()
+
+
+"""
+Tensor model parallel utilities
+"""
+
+
+def get_tensor_model_parallel_group():
+ """Get the tensor model parallel group the caller rank belongs to."""
+ assert _TENSOR_MODEL_PARALLEL_GROUP is not None, ("tensor model parallel group is not initialized")
+ return _TENSOR_MODEL_PARALLEL_GROUP
+
+
+def get_tensor_model_parallel_world_size():
+ """Return world size for the tensor model parallel group."""
+ return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())
+
+
+def get_tensor_model_parallel_rank():
+ """Return my rank for the tensor model parallel group."""
+ return torch.distributed.get_rank(group=get_tensor_model_parallel_group())
+
+
+def get_tensor_model_parallel_src_rank():
+ """Calculate the global rank corresponding to the first local rank
+ in the tensor model parallel group."""
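+ # e.g. with a tensor model parallel world size of 4, global ranks 0-3
+ # map to src rank 0 and global ranks 4-7 map to src rank 4.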
+ global_rank = torch.distributed.get_rank()
+ local_world_size = get_tensor_model_parallel_world_size()
+ return (global_rank // local_world_size) * local_world_size
+
+
+"""
+Micro Data parallel group
+"""
+
+
+def get_micro_data_parallel_group():
+ assert _MICRO_DATA_PARALLEL_GROUP is not None
+ return _MICRO_DATA_PARALLEL_GROUP
+
+
+def get_micro_data_parallel_world_size():
+ return torch.distributed.get_world_size(group=get_micro_data_parallel_group())
+
+
+def get_micro_data_parallel_rank():
+ return torch.distributed.get_rank(group=get_micro_data_parallel_group())
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/tokenizer.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8de24afb834af4e5c8d60b006e0696206519315
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/tokenizer.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
+
+from typing import List, Optional, Tuple, Union
+
+from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast)
+
+from vllm.lora.request import LoRARequest
+from vllm.utils import make_async, LRUCache
+from vllm.transformers_utils.tokenizers import *
+
+
+class TokenizerGroup:
+ """A group of tokenizers that can be used for LoRA adapters."""
+
+ def __init__(self, tokenizer: PreTrainedTokenizer, enable_lora: bool, max_num_seqs: int,
+ max_input_length: Optional[int]):
+ self.enable_lora = enable_lora
+ self.max_input_length = max_input_length
+ self.tokenizer = tokenizer
+ if enable_lora:
+ self.lora_tokenizers = LRUCache(capacity=max_num_seqs)
+ else:
+ self.lora_tokenizers = None
+
+ def encode(self,
+ prompt: str,
+ request_id: Optional[str] = None,
+ lora_request: Optional[LoRARequest] = None) -> List[int]:
+ tokenizer = self.get_lora_tokenizer(lora_request)
+ return tokenizer.encode(prompt)
+
+ async def encode_async(self,
+ prompt: str,
+ request_id: Optional[str] = None,
+ lora_request: Optional[LoRARequest] = None) -> List[int]:
+ tokenizer = await self.get_lora_tokenizer_async(lora_request)
+ return tokenizer.encode(prompt)
+
+ def get_lora_tokenizer(self, lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer":
+ if not lora_request or not self.enable_lora:
+ return self.tokenizer
+ if lora_request.lora_int_id not in self.lora_tokenizers:
+ # TODO(sgm): the lora tokenizer is also passed, but may be different
+ tokenizer = self.tokenizer
+ # tokenizer = (get_lora_tokenizer(
+ # lora_request, **self.tokenizer_config) or self.tokenizer)
+ self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
+ return tokenizer
+ else:
+ return self.lora_tokenizers.get(lora_request.lora_int_id)
+
+ # FIXME(sgm): for simplicity, we expose the base tokenizer's special tokens here
+ @property
+ def pad_token_id(self):
+ return self.tokenizer.pad_token_id
+
+ @property
+ def eos_token_id(self):
+ return self.tokenizer.eos_token_id
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/weight_loaders.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/weight_loaders.py
new file mode 100644
index 0000000000000000000000000000000000000000..72aa26d06013f5bf29e67bedfcb77fc0af80e1c7
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/weight_loaders.py
@@ -0,0 +1,95 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
+
+from typing import Dict
+import torch
+import torch.nn as nn
+
+
+# NOTE(shengguangming): replaces the original weight loader function in the class
+def parallel_weight_loader(self, param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+ """Parallel Linear weight loader."""
+ assert param.size() == loaded_weight.size(
+ ), 'parameter size does not match the loaded weight size, param size: {}, loaded_weight size: {}'.format(
+ param.size(), loaded_weight.size())
+ assert param.data.dtype == loaded_weight.data.dtype, "to share weights, the data types must also match"
+
+ param.data = loaded_weight.data
+
+
+def default_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+ """Default weight loader."""
+ assert param.size() == loaded_weight.size()
+ assert param.data.dtype == loaded_weight.data.dtype, "to share weights, the data types must also match"
+
+ param.data = loaded_weight.data
+
+
+def gpt2_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
+ for name, loaded_weight in actor_weights.items():
+ if "lm_head.weight" in name:
+ # GPT-2 ties the weights of the embedding layer and the final
+ # linear layer.
+ continue
+ if ".attn.bias" in name or ".attn.masked_bias" in name:
+ # Skip attention mask.
+ # NOTE: "c_attn.bias" should not be skipped.
+ continue
+ if not name.startswith("transformer."):
+ name = "transformer." + name
+ param = params_dict[name]
+ # The HF's GPT-2 implementation uses Conv1D instead of Linear.
+ # Because of this, we need to transpose the weights.
+ # Note(zhuohan): the logic below might break quantized models.
+ for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
+ if conv1d_weight_name not in name:
+ continue
+ if not name.endswith(".weight"):
+ continue
+ # TODO: check megatron
+ loaded_weight = loaded_weight.t()
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, loaded_weight)
+
+
+def llama_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ # NOTE(shengguangming): the megatron llama may have this prefix
+ prefix = '0.module.module.'
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+ if name.startswith(prefix):
+ name = name[len(prefix):]
+ if "rotary_emb.inv_freq" in name:
+ continue
+ else:
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, loaded_weight)
+
+
+def mistral_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ # TODO: need to implement a general way to deal with prefix
+ prefix = '0.module.module.'
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+ if name.startswith(prefix):
+ name = name[len(prefix):]
+ if "rotary_emb.inv_freq" in name:
+ continue
+ else:
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, loaded_weight)
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/worker.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/worker.py
new file mode 100644
index 0000000000000000000000000000000000000000..50eebd70b86c1160896ded81deeb7e6eedd6d605
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_3_1/worker.py
@@ -0,0 +1,314 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/worker/worker.py
+"""A GPU worker class."""
+import os
+import gc
+from typing import Dict, List, Tuple, Optional, Union, Set
+
+import torch
+import torch.distributed
+import torch.nn as nn
+
+from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig)
+from vllm.model_executor import InputMetadata, set_random_seed
+from vllm.model_executor.parallel_utils.parallel_state import (initialize_model_parallel)
+from vllm.sampling_params import SamplingParams, SamplingType
+from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
+from vllm.worker.cache_engine import CacheEngine
+from vllm.model_executor.parallel_utils.custom_all_reduce import init_custom_ar
+from vllm.model_executor.parallel_utils.parallel_state import get_tensor_model_parallel_group
+
+from .model_runner import ModelRunner
+from .model_loader import load_weights
+from .parallel_state import initialize_model_parallel_from_megatron
+from vllm.lora.request import LoRARequest
+
+
+class Worker:
+ """A worker class that executes (a partition of) the model on a GPU.
+
+ Each worker is associated with a single GPU. The worker is responsible for
+ maintaining the KV cache and executing the model on the GPU. In case of
+ distributed inference, each worker is assigned a partition of the model.
+ """
+
+ def __init__(
+ self,
+ model: Union[nn.Module, Dict], # model itself or its parameter dict
+ model_config: ModelConfig,
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ device_config: DeviceConfig,
+ rank: Optional[int] = None,
+ distributed_init_method: Optional[str] = None,
+ lora_config: Optional[LoRAConfig] = None,
+ kv_cache_dtype: Optional[str] = "auto",
+ ) -> None:
+ # self.model = model # will be replaced in the init_model
+ self.model_config = model_config
+ self.parallel_config = parallel_config
+ self.scheduler_config = scheduler_config
+ self.rank = rank
+ self.distributed_init_method = distributed_init_method
+ self.lora_config = lora_config
+
+ self.model_runner = ModelRunner(
+ model,
+ model_config,
+ parallel_config,
+ scheduler_config,
+ device_config,
+ lora_config=self.lora_config,
+ kv_cache_dtype=kv_cache_dtype,
+ )
+
+ # Uninitialized cache engine. Will be initialized by
+ # self.init_cache_engine().
+ self.cache_config = None
+ self.block_size = None
+ self.sliding_window = None
+ self.cache_engine = None
+ self.cache_events = None
+ self.gpu_cache = None
+
+ # For offloading inference engine params
+ self.cpu_model = None
+
+ def init_model(self, cupy_port: Optional[int] = None):
+ # torch.distributed.all_reduce does not free the input tensor until
+ # the synchronization point. This causes the memory usage to grow
+ # as the number of all_reduce calls increases. This env var disables
+ # this behavior.
+ # Related issue:
+ # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
+ os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+ # Env vars will be set by TORCHRUN.
+ self.rank = self.rank if self.rank is not None else int(os.getenv("RANK", "-1"))
+ local_rank = int(os.getenv("LOCAL_RANK", "0"))
+ self.device = torch.device(f"cuda:{local_rank}")
+ if self.rank < 0:
+ raise ValueError("Invalid or unspecified rank.")
+ torch.cuda.set_device(self.device)
+
+ _check_if_gpu_supports_dtype(self.model_config.dtype)
+
+ # Initialize the distributed environment.
+ # TODO: do not use cupy
+ _init_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method)
+ if not self.parallel_config.disable_custom_all_reduce:
+ init_custom_ar()
+ # Initialize the model.
+ set_random_seed(self.model_config.seed)
+ # self.model = get_model(actor_model=self.model, model_config=self.model_config)
+
+ def load_model(self):
+ self.model_runner.load_model()
+
+ @torch.inference_mode()
+ def profile_num_available_blocks(
+ self,
+ block_size: int,
+ gpu_memory_utilization: float,
+ cpu_swap_space: int,
+ cache_dtype: str,
+ ) -> Tuple[int, int]:
+ # Profile the memory usage of the model and get the maximum number of
+ # cache blocks that can be allocated with the remaining free memory.
+ torch.cuda.empty_cache()
+ # torch.cuda.reset_peak_memory_stats()
+
+ # Execute a forward pass with dummy inputs to profile the memory usage
+ # of the model.
+ self.model_runner.profile_run()
+
+ # Calculate the number of blocks that can be allocated with the
+ # profiled peak memory.
+ torch.cuda.synchronize()
+ free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+ peak_memory = total_gpu_memory - free_gpu_memory
+
+ cache_block_size = CacheEngine.get_cache_block_size(block_size, cache_dtype, self.model_config,
+ self.parallel_config)
+ # NOTE(sgm) use the remaining memory
+ num_gpu_blocks = int((free_gpu_memory * gpu_memory_utilization) // cache_block_size)
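+ # Illustrative arithmetic: 40 GiB free at 0.9 utilization with a
+ # 2 MiB cache block yields int(40 * 0.9 * 1024 / 2) = 18432 GPU blocks.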
+ # num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization - peak_memory) // cache_block_size)
+ num_cpu_blocks = int(cpu_swap_space // cache_block_size)
+ num_gpu_blocks = max(num_gpu_blocks, 0)
+ num_cpu_blocks = max(num_cpu_blocks, 0)
+ if self.model_runner.lora_manager:
+ self.model_runner.remove_all_loras()
+ gc.collect()
+ torch.cuda.empty_cache()
+ # Synchronize the number of blocks across all ranks
+ num_gpu_blocks = torch.tensor([num_gpu_blocks], device='cuda')
+ num_cpu_blocks = torch.tensor([num_cpu_blocks], device='cuda')
+ torch.distributed.all_reduce(num_gpu_blocks,
+ op=torch.distributed.ReduceOp.MIN,
+ group=get_tensor_model_parallel_group())
+ torch.distributed.all_reduce(num_cpu_blocks,
+ op=torch.distributed.ReduceOp.MIN,
+ group=get_tensor_model_parallel_group())
+ num_gpu_blocks = num_gpu_blocks.item()
+ num_cpu_blocks = num_cpu_blocks.item()
+ return num_gpu_blocks, num_cpu_blocks
+
+ def init_cache_engine(self, cache_config: CacheConfig) -> None:
+ if self.cache_engine is None and self.gpu_cache is None:
+ self.cache_config = cache_config
+ self.cache_engine = CacheEngine(self.cache_config, self.model_config, self.parallel_config)
+ self.cache_events = self.cache_engine.events
+ self.gpu_cache = self.cache_engine.gpu_cache
+ self.model_runner.set_block_size(self.cache_engine.block_size)
+
+ def free_cache_engine(self):
+ # ensure `enforce_eager=True`
+ self.cache_engine = None
+ self.gpu_cache = None
+
+ def warm_up_model(self) -> None:
+ if not self.model_config.enforce_eager:
+ self.model_runner.capture_model(self.gpu_cache)
+ # Reset the seed to ensure that the random state is not affected by
+ # the model initialization and profiling.
+ set_random_seed(self.model_config.seed)
+
+ def cache_swap(
+ self,
+ blocks_to_swap_in: Dict[int, int],
+ blocks_to_swap_out: Dict[int, int],
+ blocks_to_copy: Dict[int, List[int]],
+ ) -> None:
+ # Issue cache operations.
+ issued_cache_op = False
+ if blocks_to_swap_in:
+ self.cache_engine.swap_in(blocks_to_swap_in)
+ issued_cache_op = True
+ if blocks_to_swap_out:
+ self.cache_engine.swap_out(blocks_to_swap_out)
+ issued_cache_op = True
+ if blocks_to_copy:
+ self.cache_engine.copy(blocks_to_copy)
+ issued_cache_op = True
+
+ cache_events = self.cache_events if issued_cache_op else None
+
+ # Wait for cache operations to finish.
+ # TODO(woosuk): Profile swapping overhead and optimize if needed.
+ if cache_events is not None:
+ for event in cache_events:
+ event.wait()
+
+ @torch.inference_mode()
+ def execute_model(
+ self,
+ seq_group_metadata_list: List[SequenceGroupMetadata],
+ blocks_to_swap_in: Dict[int, int],
+ blocks_to_swap_out: Dict[int, int],
+ blocks_to_copy: Dict[int, List[int]],
+ ) -> SamplerOutput:
+ num_seq_groups = len(seq_group_metadata_list)
+ self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy)
+
+ # If there is no input, we don't need to execute the model.
+ if num_seq_groups == 0:
+ return {}
+ output = self.model_runner.execute_model(seq_group_metadata_list, self.gpu_cache)
+ return output
+
+ # # Prepare input tensors.
+ # # NOTE(shengguangming): currently we pad in our dataloader and unpad it in pre_process_input,
+ # # so we can just feed the un-padded sequence for better performance
+ # input_tokens, input_positions, input_metadata = self._prepare_inputs(seq_group_metadata_list)
+
+ # # Execute the model.
+ # output = self.model(
+ # input_ids=input_tokens,
+ # positions=input_positions,
+ # kv_caches=self.gpu_cache,
+ # input_metadata=input_metadata,
+ # cache_events=cache_events,
+ # )
+ # return output
+
+ # assume the input is .state_dict()
+ def sync_model_weights(self, actor_weights: Dict):
+ load_weights(actor_weights, self.model_runner.model)
+
+ def offload_model_weights(self) -> None:
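+ # Rebind every parameter to an (uninitialized) CPU buffer so the GPU
+ # copies can be freed; sync_model_weights() later loads real weights
+ # back into the model.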
+ if self.cpu_model is None:
+ self.cpu_model = {}
+ for name, params in self.model_runner.model.named_parameters():
+ self.cpu_model[name] = torch.empty_like(params, device='cpu')
+ params.data = self.cpu_model[name]
+ else:
+ for name, params in self.model_runner.model.named_parameters():
+ params.data = self.cpu_model[name]
+
+ def add_lora(self, lora_request: LoRARequest) -> bool:
+ return self.model_runner.add_lora(lora_request)
+
+ def remove_lora(self, lora_id: int) -> bool:
+ return self.model_runner.remove_lora(lora_id)
+
+ def list_loras(self) -> Set[int]:
+ return self.model_runner.list_loras()
+
+
+def _init_distributed_environment(
+ parallel_config: ParallelConfig,
+ rank: int,
+ distributed_init_method: Optional[str] = None,
+) -> None:
+ """Initialize the distributed environment."""
+ if torch.distributed.is_initialized():
+ print('The distributed environment has been initialized before vLLM')
+ elif not distributed_init_method:
+ raise ValueError("distributed_init_method must be set if torch.distributed "
+ "is not already initialized")
+ else:
+ torch.distributed.init_process_group(
+ backend="nccl",
+ world_size=parallel_config.world_size,
+ rank=rank,
+ # init_method=distributed_init_method,
+ )
+
+ # A small all_reduce for warmup.
+ torch.distributed.all_reduce(torch.zeros(1).cuda())
+ # TODO (shengguangming): maybe we should also flag whether megatron is initialized
+ if torch.distributed.get_world_size() > 1:
+ initialize_model_parallel_from_megatron(tensor_model_parallel_size=parallel_config.tensor_parallel_size)
+ else:
+ initialize_model_parallel()
+
+
+def _pad_to_alignment(x: List[int], multiple_of: int, pad: int) -> List[int]:
+ return x + [pad] * ((-len(x)) % multiple_of)
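+# e.g. _pad_to_alignment([1, 2, 3], multiple_of=4, pad=0) -> [1, 2, 3, 0]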
+
+
+def _pad_to_max(x: List[int], max_len: int, pad: int) -> List[int]:
+ return x + [pad] * (max_len - len(x))
+
+
+def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
+ # Check if the GPU supports the dtype.
+ if torch_dtype == torch.bfloat16:
+ compute_capability = torch.cuda.get_device_capability()
+ if compute_capability[0] < 8:
+ gpu_name = torch.cuda.get_device_name()
+ raise ValueError("Bfloat16 is only supported on GPUs with compute capability "
+ f"of at least 8.0. Your {gpu_name} GPU has compute capability "
+ f"{compute_capability[0]}.{compute_capability[1]}.")
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/__init__.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/arg_utils.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/arg_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..089bbd748b202ccceb524f91271e7bf91dc9bdfe
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/arg_utils.py
@@ -0,0 +1,320 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py
+
+import os
+import argparse
+import dataclasses
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import torch.nn as nn
+
+from transformers import PretrainedConfig
+from .config import ModelConfig, LoadConfig
+
+from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, EngineConfig, LoRAConfig, ParallelConfig,
+ SchedulerConfig, SpeculativeConfig, TokenizerPoolConfig, VisionLanguageConfig)
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.utils import str_to_int_tuple
+
+
+def nullable_str(val: str):
+ if not val or val == "None":
+ return None
+ return val
+
+
+@dataclass
+class EngineArgs:
+ """Arguments for vLLM engine."""
+ model_hf_config: PretrainedConfig = None
+ skip_tokenizer_init: bool = False
+ served_model_name: Optional[Union[str, List[str]]] = None # TODO
+ download_dir: Optional[str] = None
+ load_format: str = 'auto'
+ dtype: str = 'auto'
+ kv_cache_dtype: str = 'auto'
+ quantization_param_path: Optional[str] = None
+ seed: int = 0
+ max_model_len: Optional[int] = None
+ worker_use_ray: bool = False
+ pipeline_parallel_size: int = 1
+ tensor_parallel_size: int = 1
+ max_parallel_loading_workers: Optional[int] = None
+ block_size: int = 16
+ enable_prefix_caching: bool = False
+ use_v2_block_manager: bool = False
+ swap_space: int = 4 # GiB
+ gpu_memory_utilization: float = 0.90
+ max_num_batched_tokens: Optional[int] = None
+ max_num_seqs: int = 256
+ max_logprobs: int = 5 # OpenAI default value
+ disable_log_stats: bool = False
+ revision: Optional[str] = None
+ code_revision: Optional[str] = None
+ tokenizer_revision: Optional[str] = None
+ quantization: Optional[str] = None
+ enforce_eager: bool = False
+ max_context_len_to_capture: Optional[int] = None
+ max_seq_len_to_capture: int = 8192
+ disable_custom_all_reduce: bool = False
+ tokenizer_pool_size: int = 0
+ tokenizer_pool_type: str = "ray"
+ tokenizer_pool_extra_config: Optional[dict] = None
+ enable_lora: bool = False
+ max_loras: int = 1
+ max_lora_rank: int = 16
+ fully_sharded_loras: bool = False
+ lora_extra_vocab_size: int = 256
+ lora_dtype = 'auto'
+ max_cpu_loras: Optional[int] = None
+ device: str = 'auto'
+ ray_workers_use_nsight: bool = False
+ num_gpu_blocks_override: Optional[int] = None
+ num_lookahead_slots: int = 0
+ model_loader_extra_config: Optional[dict] = None
+
+ # Related to Vision-language models such as llava
+ image_input_type: Optional[str] = None
+ image_token_id: Optional[int] = None
+ image_input_shape: Optional[str] = None
+ image_feature_size: Optional[int] = None
+ scheduler_delay_factor: float = 0.0
+ enable_chunked_prefill: bool = False
+
+ guided_decoding_backend: str = 'outlines'
+ # Speculative decoding configuration.
+ speculative_model: Optional[str] = None
+ num_speculative_tokens: Optional[int] = None
+ speculative_max_model_len: Optional[int] = None
+ ngram_prompt_lookup_max: Optional[int] = None
+ ngram_prompt_lookup_min: Optional[int] = None
+
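+ # Minimal construction sketch (illustrative; assumes an HF config object
+ # named hf_config is at hand):
+ # engine_args = EngineArgs(model_hf_config=hf_config,
+ # tensor_parallel_size=2,
+ # dtype='bfloat16')
+ # engine_config = engine_args.create_engine_config()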
+ @staticmethod
+ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+ """Shared CLI arguments for vLLM engine."""
+ # Model arguments
+ # TODO(shengguangming): delete the unused args
+ parser.add_argument('--model',
+ type=str,
+ default='facebook/opt-125m',
+ help='name or path of the huggingface model to use')
+ parser.add_argument('--tokenizer',
+ type=str,
+ default=None, # this trimmed EngineArgs no longer defines a tokenizer field
+ help='name or path of the huggingface tokenizer to use')
+ parser.add_argument('--revision',
+ type=str,
+ default=None,
+ help='the specific model version to use. It can be a branch '
+ 'name, a tag name, or a commit id. If unspecified, will use '
+ 'the default version.')
+ parser.add_argument('--tokenizer-revision',
+ type=str,
+ default=None,
+ help='the specific tokenizer version to use. It can be a branch '
+ 'name, a tag name, or a commit id. If unspecified, will use '
+ 'the default version.')
+ parser.add_argument('--tokenizer-mode',
+ type=str,
+ default='auto', # this trimmed EngineArgs no longer defines a tokenizer_mode field
+ choices=['auto', 'slow'],
+ help='tokenizer mode. "auto" will use the fast '
+ 'tokenizer if available, and "slow" will '
+ 'always use the slow tokenizer.')
+ parser.add_argument('--trust-remote-code', action='store_true', help='trust remote code from huggingface')
+ parser.add_argument('--download-dir',
+ type=str,
+ default=EngineArgs.download_dir,
+ help='directory to download and load the weights, '
+ 'default to the default cache dir of '
+ 'huggingface')
+ parser.add_argument('--load-format',
+ type=str,
+ default=EngineArgs.load_format,
+ choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
+ help='The format of the model weights to load. '
+ '"auto" will try to load the weights in the safetensors format '
+ 'and fall back to the pytorch bin format if safetensors format '
+ 'is not available. '
+ '"pt" will load the weights in the pytorch bin format. '
+ '"safetensors" will load the weights in the safetensors format. '
+ '"npcache" will load the weights in pytorch format and store '
+ 'a numpy cache to speed up the loading. '
+ '"dummy" will initialize the weights with random values, '
+ 'which is mainly for profiling.')
+ parser.add_argument('--dtype',
+ type=str,
+ default=EngineArgs.dtype,
+ choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+ help='data type for model weights and activations. '
+ 'The "auto" option will use FP16 precision '
+ 'for FP32 and FP16 models, and BF16 precision '
+ 'for BF16 models.')
+ parser.add_argument('--max-model-len',
+ type=int,
+ default=None,
+ help='model context length. If unspecified, '
+ 'will be automatically derived from the model.')
+ # Parallel arguments
+ parser.add_argument('--worker-use-ray',
+ action='store_true',
+ help='use Ray for distributed serving, will be '
+ 'automatically set when using more than 1 GPU')
+ parser.add_argument('--pipeline-parallel-size',
+ '-pp',
+ type=int,
+ default=EngineArgs.pipeline_parallel_size,
+ help='number of pipeline stages')
+ parser.add_argument('--tensor-parallel-size',
+ '-tp',
+ type=int,
+ default=EngineArgs.tensor_parallel_size,
+ help='number of tensor parallel replicas')
+ # KV cache arguments
+ parser.add_argument('--block-size',
+ type=int,
+ default=EngineArgs.block_size,
+ choices=[8, 16, 32],
+ help='token block size')
+ # TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
+ parser.add_argument('--seed', type=int, default=EngineArgs.seed, help='random seed')
+ parser.add_argument('--swap-space',
+ type=int,
+ default=EngineArgs.swap_space,
+ help='CPU swap space size (GiB) per GPU')
+ parser.add_argument('--gpu-memory-utilization',
+ type=float,
+ default=EngineArgs.gpu_memory_utilization,
+ help='the percentage of GPU memory to be used for '
+ 'the model executor')
+ parser.add_argument('--max-num-batched-tokens',
+ type=int,
+ default=EngineArgs.max_num_batched_tokens,
+ help='maximum number of batched tokens per '
+ 'iteration')
+ parser.add_argument('--max-num-seqs',
+ type=int,
+ default=EngineArgs.max_num_seqs,
+ help='maximum number of sequences per iteration')
+ parser.add_argument('--disable-log-stats', action='store_true', help='disable logging statistics')
+ # Quantization settings.
+ parser.add_argument('--quantization',
+ '-q',
+ type=str,
+ choices=['awq', None],
+ default=None,
+ help='Method used to quantize the weights')
+ return parser
+
+ @classmethod
+ def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
+ # Get the list of attributes of this dataclass.
+ attrs = [attr.name for attr in dataclasses.fields(cls)]
+ # Set the attributes from the parsed arguments.
+ engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
+ return engine_args
+
+ def create_engine_config(
+ self,
+ ) -> EngineConfig:
+ device_config = DeviceConfig(self.device)
+ # NOTE(sgm): we only modify ModelConfig, other configs are import from vllm
+ model_config = ModelConfig(self.model_hf_config, self.dtype, self.seed, self.revision, self.code_revision,
+ self.tokenizer_revision, self.max_model_len, self.quantization,
+ self.quantization_param_path, self.enforce_eager, self.max_context_len_to_capture,
+ self.max_seq_len_to_capture, self.max_logprobs, self.skip_tokenizer_init,
+ self.served_model_name)
+ cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization,
+ self.swap_space, self.kv_cache_dtype, self.num_gpu_blocks_override,
+ model_config.get_sliding_window(), self.enable_prefix_caching)
+ parallel_config = ParallelConfig(
+ self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray,
+ self.max_parallel_loading_workers, self.disable_custom_all_reduce,
+ TokenizerPoolConfig.create_config(
+ self.tokenizer_pool_size,
+ self.tokenizer_pool_type,
+ self.tokenizer_pool_extra_config,
+ ), self.ray_workers_use_nsight)
+
+ # Use the world_size set by TORCHRUN
+ world_size = int(os.getenv("WORLD_SIZE", "-1"))
+ assert world_size != -1, "The world_size is set to -1, not initialized by TORCHRUN"
+ parallel_config.world_size = world_size
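+ # e.g. `torchrun --nproc_per_node=8 train.py` exports WORLD_SIZE=8 to
+ # every rank, which is what is read above.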
+
+ # TODO: spec config
+ speculative_config = SpeculativeConfig.maybe_create_spec_config(
+ target_model_config=model_config,
+ target_parallel_config=parallel_config,
+ target_dtype=self.dtype,
+ speculative_model=self.speculative_model,
+ num_speculative_tokens=self.num_speculative_tokens,
+ speculative_max_model_len=self.speculative_max_model_len,
+ enable_chunked_prefill=self.enable_chunked_prefill,
+ use_v2_block_manager=self.use_v2_block_manager,
+ ngram_prompt_lookup_max=self.ngram_prompt_lookup_max,
+ ngram_prompt_lookup_min=self.ngram_prompt_lookup_min,
+ )
+
+ scheduler_config = SchedulerConfig(
+ self.max_num_batched_tokens,
+ self.max_num_seqs,
+ model_config.max_model_len,
+ self.use_v2_block_manager,
+ num_lookahead_slots=(self.num_lookahead_slots
+ if speculative_config is None else speculative_config.num_lookahead_slots),
+ delay_factor=self.scheduler_delay_factor,
+ enable_chunked_prefill=self.enable_chunked_prefill,
+ )
+
+ lora_config = LoRAConfig(max_lora_rank=self.max_lora_rank,
+ max_loras=self.max_loras,
+ fully_sharded_loras=self.fully_sharded_loras,
+ lora_extra_vocab_size=self.lora_extra_vocab_size,
+ lora_dtype=self.lora_dtype,
+ max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras and self.max_cpu_loras > 0 else
+ None) if self.enable_lora else None
+
+ load_config = LoadConfig(
+ load_format=self.load_format,
+ download_dir=self.download_dir,
+ model_loader_extra_config=self.model_loader_extra_config,
+ )
+
+ if self.image_input_type:
+ if (not self.image_token_id or not self.image_input_shape or not self.image_feature_size):
+ raise ValueError('Specify `image_token_id`, `image_input_shape` and '
+ '`image_feature_size` together with `image_input_type`.')
+ vision_language_config = VisionLanguageConfig(
+ image_input_type=VisionLanguageConfig.get_image_input_enum_type(self.image_input_type),
+ image_token_id=self.image_token_id,
+ image_input_shape=str_to_int_tuple(self.image_input_shape),
+ image_feature_size=self.image_feature_size,
+ )
+ else:
+ vision_language_config = None
+
+ decoding_config = DecodingConfig(guided_decoding_backend=self.guided_decoding_backend)
+
+ return EngineConfig(model_config=model_config,
+ cache_config=cache_config,
+ parallel_config=parallel_config,
+ scheduler_config=scheduler_config,
+ device_config=device_config,
+ lora_config=lora_config,
+ vision_language_config=vision_language_config,
+ speculative_config=speculative_config,
+ load_config=load_config,
+ decoding_config=decoding_config)
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/config.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..6af04417b43a2d3672298fcf887b71fc230bb8ae
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/config.py
@@ -0,0 +1,200 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py
+
+import enum
+import json
+from typing import List, Optional, Union
+from dataclasses import dataclass, field, fields
+
+from transformers import PretrainedConfig
+
+from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization import get_quantization_config
+from vllm.transformers_utils.config import get_hf_text_config
+from vllm.utils import is_hip
+# Add for verl
+from vllm.config import ModelConfig, _get_and_verify_dtype, _get_and_verify_max_len
+
+GPTQMarlinConfig = get_quantization_config("gptq_marlin")
+
+logger = init_logger(__name__)
+
+_GB = 1 << 30
+
+
+class ModelConfig(ModelConfig):
+ """Configuration for the model.
+
+ Args:
+ hf_config: HuggingFace config of the model. verl passes the actor's
+ config object directly; its `_name_or_path` is reused as both the
+ model and tokenizer identifier.
+ dtype: Data type for model weights and activations. The "auto" option
+ will use FP16 precision for FP32 and FP16 models, and BF16 precision
+ for BF16 models.
+ seed: Random seed for reproducibility.
+ revision: The specific model version to use. It can be a branch name,
+ a tag name, or a commit id. If unspecified, will use the default
+ version.
+ code_revision: The specific revision to use for the model code on
+ Hugging Face Hub. It can be a branch name, a tag name, or a
+ commit id. If unspecified, will use the default version.
+ tokenizer_revision: The specific tokenizer version to use. It can be a
+ branch name, a tag name, or a commit id. If unspecified, will use
+ the default version.
+ max_model_len: Maximum length of a sequence (including prompt and
+ output). If None, will be derived from the model.
+ quantization: Quantization method that was used to quantize the model
+ weights. If None, we assume the model weights are not quantized.
+ quantization_param_path: Path to JSON file containing scaling factors.
+ Used to load KV cache scaling factors into the model when KV cache
+ type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also
+ be used to load activation and weight scaling factors when the
+ model dtype is FP8_E4M3 on ROCm.
+ enforce_eager: Whether to enforce eager execution. If True, we will
+ disable CUDA graph and always execute the model in eager mode.
+ If False, we will use CUDA graph and eager execution in hybrid.
+ max_context_len_to_capture: Maximum context len covered by CUDA graphs.
+ When a sequence has context length larger than this, we fall back
+ to eager mode (DEPRECATED. Use max_seq_len_to_capture instead).
+ max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
+ When a sequence has context length larger than this, we fall back
+ to eager mode
+ skip_tokenizer_init: If true, skip initialization of tokenizer and
+ detokenizer.
+ served_model_name: The model name used in metrics tag `model_name`,
+ matches the model name exposed via the APIs. If multiple model
+ names provided, the first name will be used. If not specified,
+ the model name will be the same as `model`.
+ """
+
+ def __init__(
+ self,
+ hf_config: PretrainedConfig,
+ dtype: str,
+ seed: int,
+ revision: Optional[str] = None,
+ code_revision: Optional[str] = None,
+ tokenizer_revision: Optional[str] = None,
+ max_model_len: Optional[int] = None,
+ quantization: Optional[str] = None,
+ quantization_param_path: Optional[str] = None,
+ enforce_eager: bool = False,
+ max_context_len_to_capture: Optional[int] = None,
+ max_seq_len_to_capture: Optional[int] = None,
+ max_logprobs: int = 5,
+ skip_tokenizer_init: bool = False,
+ served_model_name: Optional[Union[str, List[str]]] = None,
+ ) -> None:
+ self.model = hf_config._name_or_path
+ self.tokenizer = hf_config._name_or_path
+ self.seed = seed
+ self.revision = revision
+ self.code_revision = code_revision
+ self.tokenizer_revision = tokenizer_revision
+ self.quantization = quantization
+ self.quantization_param_path = quantization_param_path
+ self.enforce_eager = enforce_eager
+ self.max_context_len_to_capture = max_context_len_to_capture
+ if self.max_context_len_to_capture is not None:
+ raise ValueError("`max_context_len_to_capture` is deprecated. "
+ "Use `max_seq_len_to_capture` instead.")
+ self.max_seq_len_to_capture = (max_seq_len_to_capture or max_context_len_to_capture)
+ self.max_logprobs = max_logprobs
+ self.skip_tokenizer_init = skip_tokenizer_init
+
+ # self.hf_config = get_config(model, trust_remote_code, revision)
+ self.hf_config = hf_config
+ self.hf_text_config = get_hf_text_config(hf_config)
+ # TODO: for multimodal model
+ self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
+ self.max_model_len = _get_and_verify_max_len(self.hf_config, max_model_len)
+ # self.served_model_name = get_served_model_name(model,
+ # served_model_name)
+ # self._verify_load_format()
+ # self._verify_tokenizer_mode()
+ self._verify_quantization()
+ self._verify_cuda_graph()
+
+
+class LoadFormat(str, enum.Enum):
+ AUTO = 'auto'
+ MEGATRON = "megatron"
+ HF = "hf"
+ DTENSOR = 'dtensor'
+ DUMMY_HF = 'dummy_hf'
+ DUMMY_MEGATRON = 'dummy_megatron'
+ DUMMY_DTENSOR = 'dummy_dtensor'
+
+
+@dataclass
+class LoadConfig:
+ """
+ download_dir: Directory to download and load the weights, default to the
+ default cache directory of huggingface.
+ load_format: The format of the model weights to load:
+ "auto" will try to load the weights in the safetensors format and
+ fall back to the pytorch bin format if safetensors format is
+ not available.
+ "pt" will load the weights in the pytorch bin format.
+ "safetensors" will load the weights in the safetensors format.
+ "npcache" will load the weights in pytorch format and store
+ a numpy cache to speed up the loading.
+ "dummy" will initialize the weights with random values, which is
+ mainly for profiling.
+ "tensorizer" will use CoreWeave's tensorizer library for
+ fast weight loading.
+ """
+
+ load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
+ download_dir: Optional[str] = None
+ model_loader_extra_config: Optional[Union[str, dict]] = field(default_factory=dict)
+
+ def __post_init__(self):
+ model_loader_extra_config = self.model_loader_extra_config or {}
+ if isinstance(model_loader_extra_config, str):
+ self.model_loader_extra_config = json.loads(model_loader_extra_config)
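+ # e.g. the JSON string '{"enable_thing": true}' (hypothetical key) is
+ # parsed into the dict {"enable_thing": True} before the load format
+ # is verified.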
+ self._verify_load_format()
+
+ def _verify_load_format(self) -> None:
+ if not isinstance(self.load_format, str):
+ return
+
+ load_format = self.load_format.lower()
+ self.load_format = LoadFormat(load_format)
+
+ rocm_not_supported_load_format: List[str] = []
+ if is_hip() and load_format in rocm_not_supported_load_format:
+ rocm_supported_load_format = [
+ f for f in LoadFormat.__members__ if (f not in rocm_not_supported_load_format)
+ ]
+ raise ValueError(f"load format '{load_format}' is not supported in ROCm. "
+ f"Supported load formats are "
+ f"{rocm_supported_load_format}")
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/dtensor_weight_loaders.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/dtensor_weight_loaders.py
new file mode 100644
index 0000000000000000000000000000000000000000..6668b7509161e7d19d4b37f13318a80d59147448
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/dtensor_weight_loaders.py
@@ -0,0 +1,269 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
+
+from typing import Dict
+import torch
+import torch.nn as nn
+from torch.distributed._tensor import DTensor, Shard, Replicate
+
+from vllm.model_executor.layers.linear import *
+from vllm.model_executor.models import ModelRegistry
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+
+
+def gemma_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("qkv_proj", "q_proj", "q"),
+ ("qkv_proj", "k_proj", "k"),
+ ("qkv_proj", "v_proj", "v"),
+ ("gate_up_proj", "gate_proj", 0),
+ ("gate_up_proj", "up_proj", 1),
+ ]
+
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+ for (param_name, shard_name, shard_id) in stacked_params_mapping:
+ if shard_name not in name:
+ continue
+ stacked_name = name.replace(shard_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if stacked_name.endswith(".bias") and stacked_name not in params_dict:
+ continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[stacked_name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
+ break
+ else:
+ # lm_head is not used in vllm as it is tied with embed_token.
+ # To prevent errors, skip loading lm_head.weight.
+ if "lm_head.weight" in name:
+ continue
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ # GemmaRMSNorm is different from Llama's in that it multiplies
+ # (1 + weight) to the output, instead of just weight.
+ if "norm.weight" in name:
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+
+ norm_weight = local_loaded_weight + 1.0
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, norm_weight.to(dtype=param.dtype))
+ else:
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
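+
+# Illustration (not executed): the stacked-parameter mapping above rewrites a
+# HuggingFace checkpoint name onto vLLM's fused parameter before the lookup
+# in `params_dict`, e.g. for the q projection:
+#
+#   name = "model.layers.0.self_attn.q_proj.weight"
+#   stacked_name = name.replace("q_proj", "qkv_proj")
+#   # -> "model.layers.0.self_attn.qkv_proj.weight", loaded with shard_id "q"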
+
+
+def gptbigcode_dtensor_load_weights(actor_weights: Dict, vllm_model: nn.Module):
+ params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
+ for name, loaded_weight in actor_weights.items():
+ if "lm_head.weight" in name:
+ continue
+ if ".attn.bias" in name:
+ # Skip attention mask.
+ # NOTE: "c_attn.bias" should not be skipped.
+ continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
+
+
+def starcoder2_dtensor_load_weights(actor_weights: Dict, vllm_model: nn.Module):
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("qkv_proj", "q_proj", "q"),
+ ("qkv_proj", "k_proj", "k"),
+ ("qkv_proj", "v_proj", "v"),
+ ]
+
+ params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
+ for name, loaded_weight in actor_weights.items():
+ if "rotary_emb.inv_freq" in name:
+ continue
+
+ for (param_name, weight_name, shard_id) in stacked_params_mapping:
+ if weight_name not in name:
+ continue
+ name = name.replace(weight_name, param_name)
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
+ break
+ else:
+ if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
+ continue
+ param = params_dict[name]
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
+
+
+def llama_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ (".qkv_proj", ".q_proj", "q"),
+ (".qkv_proj", ".k_proj", "k"),
+ (".qkv_proj", ".v_proj", "v"),
+ (".gate_up_proj", ".gate_proj", 0),
+ (".gate_up_proj", ".up_proj", 1),
+ ]
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+ if "rotary_emb.inv_freq" in name:
+ continue
+ if ("rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name):
+ # Models trained using ColossalAI may include these tensors in
+ # the checkpoint. Skip them.
+ continue
+ # With tie_word_embeddings, we can skip lm_head.weight
+ # The weight might appear unnecessarily in the files if the model is
+ # processed with quantization, LoRA, fine-tuning, etc.
+ if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
+ continue
+ for (param_name, weight_name, shard_id) in stacked_params_mapping:
+ if weight_name not in name:
+ continue
+ name = name.replace(weight_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
+ break
+ else:
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight)
+
+
+def qwen2_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("qkv_proj", "q_proj", "q"),
+ ("qkv_proj", "k_proj", "k"),
+ ("qkv_proj", "v_proj", "v"),
+ ("gate_up_proj", "gate_proj", 0),
+ ("gate_up_proj", "up_proj", 1),
+ ]
+ params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
+ for name, loaded_weight in actor_weights.items():
+ if "rotary_emb.inv_freq" in name:
+ continue
+ if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
+ continue
+ for (param_name, weight_name, shard_id) in stacked_params_mapping:
+ if weight_name not in name:
+ continue
+ name = name.replace(weight_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
+ break
+ else:
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ param = params_dict[name]
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
+
+
+def gpt2_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+    # TODO: not implemented yet; GPT2LMHeadModel is registered below, but its
+    # weights will not actually be synced until this loader is filled in.
+    pass
+
+
+def redistribute_dtensor(param_name: str, loaded_weights: DTensor, parallelize_plan: Dict = None):
+ param_name = _process_parameter_names(name=param_name)
+ if parallelize_plan is not None:
+ assert param_name in parallelize_plan.keys(), \
+ f"param name: {param_name} not in parallelize_plan :{parallelize_plan.keys()}"
+ placement = parallelize_plan[param_name]
+ local_loaded_weights = loaded_weights.redistribute(device_mesh=loaded_weights.device_mesh,
+ placements=placement).to_local()
+ else:
+ local_loaded_weights = loaded_weights.full_tensor()
+ return local_loaded_weights
+
+
+def _process_parameter_names(name):
+ # Remove '.weight' if it exists at the end of the string
+ if name.endswith(".weight"):
+ name = name[:-7]
+
+ # Remove 'model.layers.x.' or 'model.' prefix
+ if "model.layers" in name:
+ parts = name.split('.')
+ # Reconstruct the string without 'model.layers.x.'
+ name = '.'.join(parts[3:]) # parts[0] is 'model', parts[1] is 'layers', parts[2] is 'x'
+ elif name.startswith("model."):
+ name = name[6:] # Remove 'model.'
+
+ return name
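+
+# `_process_parameter_names` is a pure function, so its behavior is easy to
+# pin down (illustrative, not executed):
+#
+#   _process_parameter_names("model.layers.0.self_attn.qkv_proj.weight")
+#   # -> "self_attn.qkv_proj"
+#   _process_parameter_names("model.embed_tokens.weight")
+#   # -> "embed_tokens"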
+
+
+__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__ = {
+ 'GPT2LMHeadModel': gpt2_dtensor_weight_loader,
+ 'LlamaForCausalLM': llama_dtensor_weight_loader,
+ 'LLaMAForCausalLM': llama_dtensor_weight_loader,
+ 'MistralForCausalLM': llama_dtensor_weight_loader, # mistral is the same as llama in vLLM
+ 'InternLMForCausalLM': llama_dtensor_weight_loader,
+ 'AquilaModel': llama_dtensor_weight_loader,
+ 'AquilaForCausalLM': llama_dtensor_weight_loader,
+ 'Phi3ForCausalLM': llama_dtensor_weight_loader,
+ 'GemmaForCausalLM': gemma_dtensor_weight_loader,
+ 'GPTBigCodeForCausalLM': gptbigcode_dtensor_load_weights,
+ 'Starcoder2ForCausalLM': starcoder2_dtensor_load_weights,
+ 'Qwen2ForCausalLM': qwen2_dtensor_weight_loader
+}
+
+
+# NOTE: `actor_weights` is the actor model's .state_dict()
+def load_dtensor_weights(actor_weights: Dict, vllm_model: nn.Module):
+    weight_loader = _get_model_weight_loader(vllm_model.__class__.__name__)
+    weight_loader(actor_weights, vllm_model)
+    # NOTE(sgm): to reduce peak memory usage, the vllm model is offloaded to
+    # CPU after init, so we must move it back to the GPU after syncing the
+    # model weights in the first iteration.
+    vllm_model = vllm_model.cuda()
+
+
+def _get_model_weight_loader(arch: str):
+ if arch in __MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__:
+ return __MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__[arch]
+    raise ValueError(f"Model architecture {arch} is not supported for now. "
+                     f"Supported architectures: {__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__.keys()}")
+
+
+# NOTE(sgm): we rely on the per-parameter weight loaders already attached to
+# each vllm sub-module, so there is nothing to patch here
+def update_dtensor_weight_loader():
+ pass
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/hf_weight_loader.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/hf_weight_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d562e596b8b75ac0a3e81ae651c77cfdc58f3a1
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/hf_weight_loader.py
@@ -0,0 +1,91 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
+
+from typing import Dict, Iterable, Tuple
+
+import torch
+import torch.nn as nn
+
+from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+
+
+def update_hf_weight_loader():
+ from vllm.model_executor.models.gemma import GemmaForCausalLM
+ GemmaForCausalLM.load_weights = gemma_load_weights
+
+
+def gemma_load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("qkv_proj", "q_proj", "q"),
+ ("qkv_proj", "k_proj", "k"),
+ ("qkv_proj", "v_proj", "v"),
+ ("gate_up_proj", "gate_proj", 0),
+ ("gate_up_proj", "up_proj", 1),
+ ]
+ params_dict = dict(self.named_parameters())
+ loaded_params = set()
+ for name, loaded_weight in weights:
+ for (param_name, shard_name, shard_id) in stacked_params_mapping:
+ if shard_name not in name:
+ continue
+ name = name.replace(shard_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, loaded_weight, shard_id)
+ break
+ else:
+ # lm_head is not used in vllm as it is tied with embed_token.
+ # To prevent errors, skip loading lm_head.weight.
+ if "lm_head.weight" in name:
+ continue
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ # GemmaRMSNorm is different from Llama's in that it multiplies
+ # (1 + weight) to the output, instead of just weight.
+ if "norm.weight" in name:
+                norm_weight = loaded_weight + 1.0  # out-of-place add, so the actor's weights are not modified in place
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, norm_weight)
+ else:
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, loaded_weight)
+        loaded_params.add(name)
+ unloaded_params = params_dict.keys() - loaded_params
+ if unloaded_params:
+ raise RuntimeError("Some weights are not initialized from checkpoints: "
+ f"{unloaded_params}")
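+
+# Background for the `+ 1.0` above: HF Gemma checkpoints store `w` where the
+# layer computes x * (1 + w), while vLLM folds the offset into the stored
+# weight and computes x * weight, so the shift happens once at load time.
+# Sanity check of the equivalence (illustrative, not executed):
+#
+#   x, w = torch.randn(8), torch.randn(8)
+#   assert torch.allclose(x * (1.0 + w), x * (w + 1.0))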
+
+
+def load_hf_weights(actor_weights: Dict, vllm_model: nn.Module):
+ assert isinstance(actor_weights, Dict)
+ with set_default_torch_dtype(next(vllm_model.parameters()).dtype): # TODO
+ vllm_model.load_weights(actor_weights.items())
+ for _, module in vllm_model.named_modules():
+ quant_method = getattr(module, "quant_method", None)
+ if quant_method is not None:
+ quant_method.process_weights_after_loading(module)
+ # FIXME: Remove this after Mixtral is updated
+ # to use quant_method.
+ if hasattr(module, "process_weights_after_loading"):
+ module.process_weights_after_loading()
+ vllm_model = vllm_model.cuda()
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/llm.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..94623a41423e841ae8e388a1d508da5b624a559a
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/llm.py
@@ -0,0 +1,306 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py
+
+from typing import Dict, List, Optional, Tuple, Union
+
+from tqdm import tqdm
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+from transformers import PretrainedConfig
+import torch.nn as nn
+from .arg_utils import EngineArgs
+from .llm_engine_sp import LLMEngine
+from vllm.lora.request import LoRARequest
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import MultiModalData
+from vllm.usage.usage_lib import UsageContext
+from vllm.utils import Counter
+import torch
+from torch.nn.utils.rnn import pad_sequence
+from verl.workers.rollout.tokenizer import HybridEngineBaseTokenizer
+
+
+class LLM:
+ """An LLM for generating texts from given prompts and sampling parameters.
+
+ This class includes a tokenizer, a language model (possibly distributed
+ across multiple GPUs), and GPU memory space allocated for intermediate
+ states (aka KV cache). Given a batch of prompts and sampling parameters,
+ this class generates texts from the model, using an intelligent batching
+ mechanism and efficient memory management.
+
+ NOTE: This class is intended to be used for offline inference. For online
+ serving, use the `AsyncLLMEngine` class instead.
+ NOTE: For the comprehensive list of arguments, see `EngineArgs`.
+
+ Args:
+ model: A HuggingFace Transformers model instance.
+ tokenizer: A HuggingFace Transformers tokenizer instance.
+ tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
+ if available, and "slow" will always use the slow tokenizer.
+ trust_remote_code: Trust remote code (e.g., from HuggingFace) when
+ downloading the model and tokenizer.
+ tensor_parallel_size: The number of GPUs to use for distributed
+ execution with tensor parallelism.
+ dtype: The data type for the model weights and activations. Currently,
+ we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
+ the `torch_dtype` attribute specified in the model config file.
+ However, if the `torch_dtype` in the config is `float32`, we will
+ use `float16` instead.
+ quantization: The method used to quantize the model weights. Currently,
+ we support "awq". If None, we assume the model weights are not
+ quantized and use `dtype` to determine the data type of the weights.
+ revision: The specific model version to use. It can be a branch name,
+ a tag name, or a commit id.
+ tokenizer_revision: The specific tokenizer version to use. It can be a
+ branch name, a tag name, or a commit id.
+ seed: The seed to initialize the random number generator for sampling.
+ gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
+ reserve for the model weights, activations, and KV cache. Higher
+ values will increase the KV cache size and thus improve the model's
+ throughput. However, if the value is too high, it may cause out-of-
+ memory (OOM) errors.
+ swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
+ This can be used for temporarily storing the states of the requests
+ when their `best_of` sampling parameters are larger than 1. If all
+ requests will have `best_of=1`, you can safely set this to 0.
+ Otherwise, too small values may cause out-of-memory (OOM) errors.
+ enforce_eager: Whether to enforce eager execution. If True, we will
+ disable CUDA graph and always execute the model in eager mode.
+ If False, we will use CUDA graph and eager execution in hybrid.
+ max_context_len_to_capture: Maximum context len covered by CUDA graphs.
+ When a sequence has context length larger than this, we fall back
+ to eager mode.
+ disable_custom_all_reduce: See ParallelConfig
+ """
+
+ def __init__(
+ self,
+ model: Union[nn.Module, Dict], # model itself or its parameter dict
+ tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer],
+ model_hf_config: PretrainedConfig,
+ tokenizer_mode: str = "auto",
+ trust_remote_code: bool = False,
+ tensor_parallel_size: int = 1,
+ dtype: str = "auto",
+ quantization: Optional[str] = None,
+ revision: Optional[str] = None,
+ tokenizer_revision: Optional[str] = None,
+ seed: int = 0,
+ gpu_memory_utilization: float = 0.9,
+ swap_space: int = 4,
+ enforce_eager: bool = False,
+        max_context_len_to_capture: Optional[int] = None,
+        disable_custom_all_reduce: bool = False,
+        load_format: str = 'auto',
+ **kwargs,
+ ) -> None:
+ if "disable_log_stats" not in kwargs:
+ kwargs["disable_log_stats"] = True
+ engine_args = EngineArgs(
+ model_hf_config=model_hf_config,
+ tensor_parallel_size=tensor_parallel_size,
+ dtype=dtype,
+ quantization=quantization,
+ revision=revision,
+ tokenizer_revision=tokenizer_revision,
+ seed=seed,
+ gpu_memory_utilization=gpu_memory_utilization,
+ swap_space=swap_space,
+ enforce_eager=enforce_eager,
+ max_context_len_to_capture=max_context_len_to_capture,
+ disable_custom_all_reduce=disable_custom_all_reduce,
+ load_format=load_format,
+ **kwargs,
+ )
+ tokenizer_cls = (PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer)
+ if not isinstance(tokenizer, tokenizer_cls):
+ raise ValueError(
+                f"Unexpected tokenizer type: {type(tokenizer)}. Must be "
+                "one of the following: PreTrainedTokenizer, PreTrainedTokenizerFast, verl.workers.rollout.HybridEngineBaseTokenizer"
+ )
+ self.llm_engine = LLMEngine.from_engine_args(model, tokenizer, engine_args)
+ self.request_counter = Counter()
+
+ def init_cache_engine(self):
+ self.llm_engine.init_cache_engine()
+
+ def free_cache_engine(self):
+ self.llm_engine.free_cache_engine()
+
+ def get_tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+ return self.llm_engine.tokenizer
+
+ def set_tokenizer(
+ self,
+ tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+ ) -> None:
+ self.llm_engine.tokenizer = tokenizer
+
+ def generate(
+ self,
+ prompts: Optional[Union[str, List[str]]] = None,
+ sampling_params: Optional[Union[SamplingParams, List[SamplingParams]]] = None,
+ prompt_token_ids: Optional[List[List[int]]] = None,
+ use_tqdm: bool = True,
+ lora_request: Optional[LoRARequest] = None,
+ multi_modal_data: Optional[MultiModalData] = None,
+ ) -> List[RequestOutput]:
+ """Generates the completions for the input prompts.
+
+ NOTE: This class automatically batches the given prompts, considering
+ the memory constraint. For the best performance, put all of your prompts
+ into a single list and pass it to this method.
+
+ Args:
+ prompts: A list of prompts to generate completions for.
+ sampling_params: The sampling parameters for text generation. If
+ None, we use the default sampling parameters.
+ When it is a single value, it is applied to every prompt.
+ When it is a list, the list must have the same length as the
+ prompts and it is paired one by one with the prompt.
+ prompt_token_ids: A list of token IDs for the prompts. If None, we
+ use the tokenizer to convert the prompts to token IDs.
+ use_tqdm: Whether to use tqdm to display the progress bar.
+ lora_request: LoRA request to use for generation, if any.
+ multi_modal_data: Multi modal data.
+
+ Returns:
+ A list of `RequestOutput` objects containing the generated
+ completions in the same order as the input prompts.
+ """
+ if prompts is None and prompt_token_ids is None:
+ raise ValueError("Either prompts or prompt_token_ids must be "
+ "provided.")
+ if self.llm_engine.model_config.skip_tokenizer_init \
+ and prompts is not None:
+ raise ValueError("prompts must be None if skip_tokenizer_init "
+ "is True")
+ if isinstance(prompts, str):
+ # Convert a single prompt to a list.
+ prompts = [prompts]
+ if (prompts is not None and prompt_token_ids is not None and len(prompts) != len(prompt_token_ids)):
+ raise ValueError("The lengths of prompts and prompt_token_ids "
+ "must be the same.")
+
+ if prompts is not None:
+ num_requests = len(prompts)
+ else:
+ assert prompt_token_ids is not None
+ num_requests = len(prompt_token_ids)
+
+ if sampling_params is None:
+ # Use default sampling params.
+ sampling_params = SamplingParams()
+
+ elif isinstance(sampling_params, list) and len(sampling_params) != num_requests:
+ raise ValueError("The lengths of prompts and sampling_params "
+ "must be the same.")
+ if multi_modal_data:
+ multi_modal_data.data = multi_modal_data.data.to(torch.float16)
+
+ # Add requests to the engine.
+ for i in range(num_requests):
+ prompt = prompts[i] if prompts is not None else None
+ token_ids = None if prompt_token_ids is None else prompt_token_ids[i]
+            if token_ids is not None and not isinstance(token_ids, list):
+                # NOTE(shengguangming): convert the rollout input (a padded tensor) into List[int]
+                token_ids = self._pre_process_inputs(token_ids)
+ self._add_request(
+ prompt,
+ sampling_params[i] if isinstance(sampling_params, list) else sampling_params,
+ token_ids,
+ lora_request=lora_request,
+ # Get ith image while maintaining the batch dim.
+ multi_modal_data=MultiModalData(type=multi_modal_data.type, data=multi_modal_data.data[i].unsqueeze(0))
+ if multi_modal_data else None,
+ )
+ return self._run_engine(use_tqdm)
+
+ def _add_request(
+ self,
+ prompt: Optional[str],
+ sampling_params: SamplingParams,
+ prompt_token_ids: Optional[List[int]],
+ lora_request: Optional[LoRARequest] = None,
+ multi_modal_data: Optional[MultiModalData] = None,
+ ) -> None:
+ request_id = str(next(self.request_counter))
+ self.llm_engine.add_request(request_id,
+ prompt,
+ sampling_params,
+ prompt_token_ids,
+ lora_request=lora_request,
+ multi_modal_data=multi_modal_data)
+
+ def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
+ # Initialize tqdm.
+ if use_tqdm:
+ num_requests = self.llm_engine.get_num_unfinished_requests()
+ pbar = tqdm(total=num_requests, desc="Processed prompts", dynamic_ncols=True)
+ # Run the engine.
+ outputs: List[RequestOutput] = []
+ while self.llm_engine.has_unfinished_requests():
+ step_outputs = self.llm_engine.step()
+ for output in step_outputs:
+ if output.finished:
+ outputs.append(output)
+ if use_tqdm:
+ pbar.update(1)
+ if use_tqdm:
+ pbar.close()
+ # Sort the outputs by request ID.
+        # This is necessary because some requests may finish earlier than
+        # requests that were submitted before them.
+        outputs = sorted(outputs, key=lambda x: int(x.request_id))
+        # TODO(shengguangming): maybe we can hook into the autoregressive loop directly, instead of only post-processing, for better performance
+ return self._post_process_outputs(outputs)
+
+ # NOTE(shengguangming): add for verl
+ # TODO(sgm): we can optimize it by making the dataloader yield List[int] without padding.
+ def _pre_process_inputs(self, prompt_token_ids: torch.Tensor) -> List[int]:
+ # remove the left padding in the prompt token_id
+ pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id is not None else self.llm_engine.tokenizer.eos_token_id
+ non_pad_index = torch.nonzero(prompt_token_ids != pad_token_id, as_tuple=False)[0][0]
+ token_ids = prompt_token_ids[non_pad_index:].tolist()
+ return token_ids
+
+ # NOTE(shengguangming): add for verl
+ def _post_process_outputs(self, request_outputs: List[RequestOutput]) -> Tuple[torch.Tensor, torch.Tensor]:
+ output_token_ids = []
+ logprobs = []
+ for request_output in request_outputs: # List[RequestOutput]
+ outputs = request_output.outputs
+ for output in outputs: # List[CompletionOutput], usually len == 1
+ output_token_ids.append(torch.tensor(output.token_ids))
+                # TODO(shengguangming): can be optimized by rewriting the Sampler._get_logprobs() logic
+ logprobs_dicts = output.logprobs
+ if logprobs_dicts is not None:
+ logprob = []
+                    for logprobs_dict, token_id in zip(logprobs_dicts, output.token_ids):
+                        logprob.append(logprobs_dict[token_id].logprob)
+ logprobs.append(torch.tensor(logprob))
+
+ pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id is not None else self.llm_engine.tokenizer.eos_token_id
+ output_token_ids = pad_sequence(output_token_ids, batch_first=True, padding_value=pad_token_id)
+        if len(logprobs) > 0:
+            # NOTE: padded with pad_token_id only to form a rectangular tensor; the padded entries are not valid logprobs
+            logprobs = pad_sequence(logprobs, batch_first=True, padding_value=pad_token_id)
+ return output_token_ids, logprobs
+
+ def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
+ self.llm_engine.sync_model_weights(actor_weights=actor_weights, load_format=load_format)
+
+ def offload_model_weights(self) -> None:
+ self.llm_engine.offload_model_weights()
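+
+
+# A minimal sketch of the tensor plumbing in `_pre_process_inputs` and
+# `_post_process_outputs` above, using plain torch with made-up values:
+if __name__ == "__main__":
+    pad_token_id = 0
+    prompt = torch.tensor([0, 0, 5, 6, 7])  # left-padded prompt from the dataloader
+    first_non_pad = torch.nonzero(prompt != pad_token_id, as_tuple=False)[0][0]
+    assert prompt[first_non_pad:].tolist() == [5, 6, 7]
+
+    # responses of unequal length are right-padded back into one batch tensor
+    responses = [torch.tensor([11, 12, 13]), torch.tensor([21])]
+    batch = pad_sequence(responses, batch_first=True, padding_value=pad_token_id)
+    assert batch.tolist() == [[11, 12, 13], [21, 0, 0]]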
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/llm_engine_sp.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/llm_engine_sp.py
new file mode 100644
index 0000000000000000000000000000000000000000..75bf11ab319a623daf9d6d57668e17c46c2cc4ec
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/llm_engine_sp.py
@@ -0,0 +1,283 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/llm_engine.py
+
+import torch
+from typing import Dict, Optional, Union, Type
+
+import vllm
+from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoRAConfig, ParallelConfig, SchedulerConfig,
+ SpeculativeConfig, VisionLanguageConfig)
+from vllm.core.scheduler import Scheduler
+from vllm.engine.output_processor.interfaces import (SequenceGroupOutputProcessor)
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.executor.executor_base import ExecutorBase
+from vllm.logger import init_logger
+from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.engine.metrics import StatLogger
+from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message)
+from vllm.utils import Counter
+from vllm.engine.llm_engine import _load_generation_config_dict
+from vllm.engine.llm_engine import LLMEngine
+
+import torch.nn as nn
+from .arg_utils import EngineArgs
+from .tokenizer import TokenizerGroup
+from .config import ModelConfig, LoadConfig
+
+logger = init_logger(__name__)
+_LOCAL_LOGGING_INTERVAL_SEC = 5
+
+
+class LLMEngine(LLMEngine):
+ """An LLM engine that receives requests and generates texts.
+
+ This is the main class for the vLLM engine. It receives requests
+ from clients and generates texts from the LLM. It includes a tokenizer, a
+ language model (possibly distributed across multiple GPUs), and GPU memory
+ space allocated for intermediate states (aka KV cache). This class utilizes
+ iteration-level scheduling and efficient memory management to maximize the
+ serving throughput.
+
+ The `LLM` class wraps this class for offline batched inference and the
+ `AsyncLLMEngine` class wraps this class for online serving.
+
+ NOTE: The config arguments are derived from the `EngineArgs` class. For the
+ comprehensive list of arguments, see `EngineArgs`.
+
+ Args:
+        model: the actor model initialized outside vllm (added for verl)
+        tokenizer: the initialized tokenizer (added for verl)
+ model_config: The configuration related to the LLM model.
+ cache_config: The configuration related to the KV cache memory
+ management.
+ parallel_config: The configuration related to distributed execution.
+ scheduler_config: The configuration related to the request scheduler.
+ distributed_init_method: The initialization method for distributed
+ execution. See `torch.distributed.init_process_group` for details.
+ placement_group: Ray placement group for distributed execution.
+ Required for distributed execution.
+ log_stats: Whether to log statistics.
+ """
+
+ def __init__(
+ self,
+ # NOTE(sgm): first two arguments are added for verl
+ model: Union[nn.Module, Dict], # model itself or its parameter dict
+ tokenizer: nn.Module,
+ # NOTE(sgm): vllm original arguments
+ model_config: ModelConfig,
+ cache_config: CacheConfig,
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ device_config: DeviceConfig,
+ load_config: LoadConfig,
+ lora_config: Optional[LoRAConfig],
+ vision_language_config: Optional[VisionLanguageConfig],
+ speculative_config: Optional[SpeculativeConfig],
+ decoding_config: Optional[DecodingConfig],
+ executor_class: Type[ExecutorBase],
+ log_stats: bool,
+ usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+ ) -> None:
+        logger.info(
+            "Initializing an LLM engine (v%s) with config: "
+            "model=%r, speculative_config=%r, tokenizer=%r, "
+            "skip_tokenizer_init=%s, revision=%s, tokenizer_revision=%s, "
+            "dtype=%s, max_seq_len=%d, download_dir=%r, load_format=%s, "
+            "tensor_parallel_size=%d, disable_custom_all_reduce=%s, "
+            "quantization=%s, enforce_eager=%s, kv_cache_dtype=%s, "
+            "quantization_param_path=%s, device_config=%s, "
+            "decoding_config=%r, seed=%d)",
+            vllm.__version__,
+            model_config.model,
+            speculative_config,
+            model_config.tokenizer,
+            model_config.skip_tokenizer_init,
+            model_config.revision,
+            model_config.tokenizer_revision,
+            model_config.dtype,
+            model_config.max_model_len,
+            load_config.download_dir,
+            load_config.load_format,
+            parallel_config.tensor_parallel_size,
+            parallel_config.disable_custom_all_reduce,
+            model_config.quantization,
+            model_config.enforce_eager,
+            cache_config.cache_dtype,
+            model_config.quantization_param_path,
+            device_config.device,
+            decoding_config,
+            model_config.seed,
+        )
+        # NOTE: tokenizer_mode and trust_remote_code are not tracked by verl's
+        # ModelConfig, so they are omitted from the log line above.
+ # TODO(woosuk): Print more configs in debug mode.
+
+ self.model_config = model_config # TODO: currently is hfconfig
+ self.cache_config = cache_config
+ self.lora_config = lora_config
+ self.vision_language_config = vision_language_config
+ self.parallel_config = parallel_config
+ self.scheduler_config = scheduler_config
+ self.device_config = device_config
+ self.speculative_config = speculative_config
+ self.load_config = load_config
+ self.decoding_config = decoding_config or DecodingConfig()
+ self.log_stats = log_stats
+
+ # self.model = model # should not store the model, it should be deleted
+ # TODO(shengguangming): maybe we can choose init here or from arguments
+ if not self.model_config.skip_tokenizer_init:
+ # TODO: check tokenizer class
+ self._init_tokenizer(tokenizer)
+ self.detokenizer = Detokenizer(self.tokenizer)
+ else:
+ self.detokenizer = None
+ self.tokenizer = None
+
+ self.seq_counter = Counter()
+        # TODO: the exact usage here is unclear; kept for parity with vLLM
+ self.generation_config_fields = _load_generation_config_dict(model_config)
+
+ self.model_executor = executor_class(
+ model=model, # add for spmd_gpu_executor
+ model_config=model_config,
+ cache_config=cache_config,
+ parallel_config=parallel_config,
+ scheduler_config=scheduler_config,
+ device_config=device_config,
+ lora_config=lora_config,
+ vision_language_config=vision_language_config,
+ speculative_config=speculative_config,
+ load_config=load_config,
+ )
+
+ # Profile the memory usage and initialize the cache.
+ self._initialize_kv_caches()
+
+ # If usage stat is enabled, collect relevant info.
+ if is_usage_stats_enabled():
+ from vllm.model_executor.model_loader import (get_architecture_class_name)
+ usage_message.report_usage(
+ get_architecture_class_name(model_config),
+ usage_context,
+ extra_kvs={
+ # Common configuration
+ "dtype": str(model_config.dtype),
+ "tensor_parallel_size": parallel_config.tensor_parallel_size,
+ "block_size": cache_config.block_size,
+ "gpu_memory_utilization": cache_config.gpu_memory_utilization,
+
+ # Quantization
+ "quantization": model_config.quantization,
+ "kv_cache_dtype": cache_config.cache_dtype,
+
+ # Feature flags
+ "enable_lora": bool(lora_config),
+ "enable_prefix_caching": cache_config.enable_prefix_caching,
+ "enforce_eager": model_config.enforce_eager,
+ "disable_custom_all_reduce": parallel_config.disable_custom_all_reduce,
+ })
+
+ if self.tokenizer:
+ # Ping the tokenizer to ensure liveness if it runs in a
+ # different process.
+ self.tokenizer.ping()
+
+ # Create the scheduler.
+        # NOTE: the cache_config here has been updated with the number of
+        # GPU and CPU blocks, which are profiled in the distributed executor.
+        # NOTE(shengguangming): each process will have an independent scheduler
+ self.scheduler = Scheduler(scheduler_config, cache_config, lora_config)
+
+ # Metric Logging.
+ if self.log_stats:
+ self.stat_logger = StatLogger(local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
+ labels=dict(model_name=model_config.served_model_name),
+ max_model_len=self.model_config.max_model_len)
+ self.stat_logger.info("cache_config", self.cache_config)
+
+ # Create sequence output processor, e.g. for beam search or
+ # speculative decoding.
+ self.output_processor = (SequenceGroupOutputProcessor.create_output_processor(
+ self.scheduler_config,
+ self.detokenizer,
+ self.scheduler,
+ self.seq_counter,
+ self.get_tokenizer_for_seq,
+ stop_checker=StopChecker(
+ self.scheduler_config.max_model_len,
+ self.get_tokenizer_for_seq,
+ ),
+ ))
+
+    # TODO(sgm): added for verl, but we may not need a tokenizer in Rollout
+ def _init_tokenizer(self, tokenizer, **tokenizer_init_kwargs):
+ init_kwargs = dict(enable_lora=bool(self.lora_config),
+ max_num_seqs=self.scheduler_config.max_num_seqs,
+ max_input_length=None)
+ init_kwargs.update(tokenizer_init_kwargs)
+ self.tokenizer: TokenizerGroup = TokenizerGroup(tokenizer, **init_kwargs)
+
+ def init_cache_engine(self):
+        # TODO: check whether we should rebuild the CUDA graph every iteration when offloading/loading the KV cache;
+        # re-capturing the CUDA graph would be time-consuming
+ self.model_executor.init_cache_engine()
+
+ def free_cache_engine(self):
+ self.model_executor.free_cache_engine()
+
+ # NOTE(sgm): currently, we only support GPU executor
+    # The GPUExecutor removes the Ray dependency
+ @classmethod
+ def from_engine_args(
+ cls,
+ model,
+ tokenizer,
+ engine_args: EngineArgs,
+ usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+ ) -> "LLMEngine":
+ """Creates an LLM engine from the engine arguments."""
+ # Create the engine configs.
+ engine_config = engine_args.create_engine_config()
+
+ # Initialize the cluster and specify the executor class.
+        assert engine_config.device_config.device_type == "cuda", \
+            "Currently, the vllm in verl only supports running on GPU"
+
+ if engine_config.parallel_config.world_size == 1:
+ engine_config.load_config.load_format = "dummy_hf"
+
+ from .spmd_gpu_executor import SPMDGPUExecutor
+ executor_class = SPMDGPUExecutor
+
+ # Create the LLM engine.
+ engine = cls(
+ model,
+ tokenizer,
+ **engine_config.to_dict(),
+ executor_class=executor_class,
+ log_stats=not engine_args.disable_log_stats,
+ usage_context=usage_context,
+ )
+ return engine
+
+ def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
+ self.model_executor.sync_model_weights(actor_weights=actor_weights, load_format=load_format)
+
+ def offload_model_weights(self) -> None:
+ self.model_executor.offload_model_weights()
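+
+
+# Typical lifecycle in verl's hybrid engine (a sketch; the rollout worker is
+# what actually drives these calls):
+#
+#   engine = LLMEngine.from_engine_args(model, tokenizer, engine_args)
+#   engine.sync_model_weights(actor_weights, load_format="dtensor")  # actor -> vLLM
+#   ...generate rollouts via the wrapping LLM class...
+#   engine.free_cache_engine()      # release the KV cache between iterations
+#   engine.offload_model_weights()  # release weights until the next sync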
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/megatron_weight_loaders.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/megatron_weight_loaders.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a7c2e2cfe57f1bd3a4b49b61d1085a76b0a9b0a
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/megatron_weight_loaders.py
@@ -0,0 +1,348 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
+
+from typing import Dict
+import torch
+import torch.nn as nn
+
+from vllm.model_executor.layers.linear import *
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead
+from vllm.model_executor.layers.activation import ScaledActivation
+from vllm.model_executor.models import ModelRegistry
+
+
+# NOTE(shengguangming): replaces the original weight loader function on each layer class
+def parallel_weight_loader(self, param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+ """Parallel Linear weight loader."""
+    assert param.size() == loaded_weight.size(
+    ), 'the parameter size is not aligned with the loaded weight size, param size: {}, loaded_weight size: {}'.format(
+        param.size(), loaded_weight.size())
+    assert param.data.dtype == loaded_weight.data.dtype, "to share weights, the data types must also match"
+
+ param.data = loaded_weight.data
+
+
+def default_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+ """Default weight loader."""
+ assert param.size() == loaded_weight.size()
+    assert param.data.dtype == loaded_weight.data.dtype, "to share weights, the data types must also match"
+
+ param.data = loaded_weight.data
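+
+# Both loaders above rebind `param.data` to the loaded tensor rather than
+# copying, so the vLLM parameter and the Megatron actor weight end up sharing
+# storage (illustrative, not executed):
+#
+#   p = torch.nn.Parameter(torch.zeros(2, 2))
+#   w = torch.ones(2, 2)
+#   default_weight_loader(p, w)
+#   assert p.data.data_ptr() == w.data_ptr()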
+
+
+def gpt2_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
+ for name, loaded_weight in actor_weights.items():
+ if "lm_head.weight" in name:
+ # GPT-2 ties the weights of the embedding layer and the final
+ # linear layer.
+ continue
+ if ".attn.bias" in name or ".attn.masked_bias" in name:
+ # Skip attention mask.
+ # NOTE: "c_attn.bias" should not be skipped.
+ continue
+ if not name.startswith("transformer."):
+ name = "transformer." + name
+ param = params_dict[name]
+ # The HF's GPT-2 implementation uses Conv1D instead of Linear.
+ # Because of this, we need to transpose the weights.
+ # Note(zhuohan): the logic below might break quantized models.
+ for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
+ if conv1d_weight_name not in name:
+ continue
+ if not name.endswith(".weight"):
+ continue
+ # TODO: check megatron
+ loaded_weight = loaded_weight.t()
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, loaded_weight)
+
+
+def llama_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ # NOTE(shengguangming): the megatron llama may have this prefix
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+ if "rotary_emb.inv_freq" in name:
+ continue
+ else:
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, loaded_weight)
+
+
+def llama_megatron_core_te_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ params_mapping = [
+ # (megatron core gpt model name, vllm model name)
+ ("embedding.word_embeddings", "model.embed_tokens"),
+ ("self_attention.linear_qkv.layer_norm_weight", "input_layernorm.weight"),
+ ("self_attention.linear_qkv.layer_norm_bias", "input_layernorm.bias"),
+        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
+ ("self_attention.linear_proj", 'self_attn.o_proj'),
+ ('pre_mlp_layernorm', 'post_attention_layernorm'),
+ ('mlp.linear_fc1.layer_norm_weight', 'post_attention_layernorm.weight'),
+ ('mlp.linear_fc1.layer_norm_bias', 'post_attention_layernorm.bias'),
+ ('mlp.linear_fc1', 'mlp.gate_up_proj'),
+ ('mlp.linear_fc2', 'mlp.down_proj'),
+ ('decoder.final_layernorm', 'model.norm'),
+ ('output_layer', 'lm_head'),
+ ]
+ # NOTE(shengguangming): the megatron llama may have this prefix
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+ name = _replace_name(name, params_mapping)
+ if name.endswith('.bias') and name not in params_dict:
+ continue
+ if "rotary_emb.inv_freq" in name:
+ continue
+ else:
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, loaded_weight)
+
+
+def llama_megatron_core_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ params_mapping = [
+ # (megatron core gpt model name, vllm model name)
+ ("embedding.word_embeddings", "model.embed_tokens"),
+ ("self_attention.linear_qkv", "self_attn.qkv_proj"),
+ ("self_attention.linear_proj", 'self_attn.o_proj'),
+ (
+ 'input_layernorm',
+ 'input_layernorm',
+ ),
+ ('pre_mlp_layernorm', 'post_attention_layernorm'),
+ ('mlp.linear_fc1', 'mlp.gate_up_proj'),
+ ('mlp.linear_fc2', 'mlp.down_proj'),
+ ('decoder.final_layernorm', 'model.norm'),
+ ('output_layer', 'lm_head'),
+ ]
+ # NOTE(shengguangming): the megatron llama may have this prefix
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+ name = _replace_name(name, params_mapping)
+ if name.endswith('.bias') and name not in params_dict:
+ continue
+ if "rotary_emb.inv_freq" in name:
+ continue
+ else:
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, loaded_weight)
+
+
+def _replace_name(megatron_name, name_mapping):
+ for m_name, v_name in name_mapping:
+ if m_name not in megatron_name:
+ continue
+ if 'layers' in megatron_name: # deal with decoder layers
+ megatron_name = megatron_name.replace('decoder', 'model')
+ megatron_name_list = megatron_name.split('.')
+ if 'layer_norm_weight' in megatron_name_list or 'layer_norm_bias' in megatron_name_list:
+ param_name_list = megatron_name_list[:3]
+ param_name_list.append(v_name)
+ param_name = '.'.join(param_name_list)
+ else:
+ param_name_list = megatron_name_list[:3]
+ weight_or_bias = megatron_name_list[-1]
+ param_name_list.append(v_name)
+ param_name_list.append(weight_or_bias)
+ param_name = '.'.join(param_name_list)
+ return param_name
+ else:
+ param_name = megatron_name.replace(m_name, v_name)
+ return param_name
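+
+# `_replace_name` is pure; for example (illustrative, not executed):
+#
+#   _replace_name("decoder.layers.0.self_attention.linear_qkv.weight",
+#                 params_mapping)
+#   # -> "model.layers.0.self_attn.qkv_proj.weight"
+#   _replace_name("embedding.word_embeddings.weight", params_mapping)
+#   # -> "model.embed_tokens.weight"
+
+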
+def mistral_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ # TODO: need to implement a general way to deal with prefix
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+ if "rotary_emb.inv_freq" in name:
+ continue
+ else:
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, loaded_weight)
+
+
+__LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__ = {
+ ColumnParallelLinear: parallel_weight_loader,
+ MergedColumnParallelLinear: parallel_weight_loader,
+ QKVParallelLinear: parallel_weight_loader,
+ RowParallelLinear: parallel_weight_loader,
+ VocabParallelEmbedding: parallel_weight_loader,
+ ParallelLMHead: parallel_weight_loader
+ # "ScaledActivation.weight_loader": ScaledActivation, # TODO(shengguangming): latest commit in vllm fix awq for this function and add load_weights
+ # "default_weight_loader": default_weight_loader
+}
+
+
+__MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__ = {
+ 'GPT2LMHeadModel': gpt2_weight_loader,
+ 'LlamaForCausalLM': llama_megatron_core_te_weight_loader, # use te backend for open-source megatron
+ 'LLaMAForCausalLM': llama_megatron_core_te_weight_loader,
+ 'MistralForCausalLM': mistral_megatron_weight_loader,
+}
+
+
+# NOTE: `actor_weights` is the actor model's .state_dict()
+def load_megatron_weights(actor_weights: Dict, vllm_model: nn.Module):
+    weight_loader = _get_model_weight_loader(vllm_model.__class__.__name__)
+    weight_loader(actor_weights, vllm_model)
+    # NOTE(sgm): to reduce peak memory usage, the vllm model is offloaded to
+    # CPU after init, so we must move it back to the GPU after syncing the
+    # model weights in the first iteration.
+    vllm_model = vllm_model.cuda()
+
+
+def _get_model_weight_loader(arch: str):
+ if arch in __MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__:
+ return __MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__[arch]
+    raise ValueError(f"Model architecture {arch} is not supported for now. "
+                     f"Supported architectures: {__MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__.keys()}")
+
+
+def update_megatron_weight_loader():
+ for layer_class, weight_loader in __LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__.items():
+ layer_class.weight_loader = weight_loader
+ VocabParallelEmbedding.__init__ = vocab_init
+
+
+# FIXME(shengguangming): vLLM pads the vocab size up to a multiple of 64, which
+# may incur out-of-bounds indexing when syncing Megatron weights, so we rewrite
+# the init function of VocabParallelEmbedding to keep the unpadded vocab size
+DEFAULT_VOCAB_PADDING_SIZE = 64
+
+
+def vocab_init(self,
+ num_embeddings: int,
+ embedding_dim: int,
+ params_dtype: Optional[torch.dtype] = None,
+ org_num_embeddings: Optional[int] = None,
+ padding_size: int = DEFAULT_VOCAB_PADDING_SIZE):
+ super(VocabParallelEmbedding, self).__init__()
+
+ # Keep the input dimensions.
+ # TODO (pad to be divided by 4)
+ self.num_embeddings = num_embeddings
+ self.org_vocab_size = org_num_embeddings or num_embeddings
+
+ # self.num_embeddings_padded = pad_vocab_size(num_embeddings,
+ # padding_size)
+ self.embedding_dim = embedding_dim
+ if params_dtype is None:
+ params_dtype = torch.get_default_dtype()
+ self.tp_size = get_tensor_model_parallel_world_size()
+    # Divide the weight matrix along the vocabulary dimension.
+
+ # TODO: remove dependencies from megatron
+ from megatron.core.tensor_parallel.utils import VocabUtility
+ self.vocab_start_index, self.vocab_end_index = (VocabUtility.vocab_range_from_global_vocab_size(
+ self.num_embeddings, get_tensor_model_parallel_rank(), self.tp_size))
+ self.num_embeddings_per_partition = (self.vocab_end_index - self.vocab_start_index)
+ self.weight = Parameter(
+ torch.empty(
+ self.num_embeddings_per_partition,
+ self.embedding_dim,
+ # device=torch.cuda.current_device(),
+ dtype=params_dtype))
+ set_weight_attrs(self.weight, {"parallel_dim": 0, "weight_loader": self.weight_loader})
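+
+
+# The per-rank shard computed above follows Megatron's convention of an even
+# split over the *unpadded* vocabulary; assuming num_embeddings divides evenly
+# by the TP world size, the arithmetic reduces to:
+#
+#   per_partition = num_embeddings // tp_size
+#   vocab_start_index = tp_rank * per_partition
+#   vocab_end_index = vocab_start_index + per_partition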
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/model_loader.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/model_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f4013451ff7e4e4612719251f48d5849fdc15d5
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/model_loader.py
@@ -0,0 +1,265 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/model_loader
+"""Utilities for selecting and loading models."""
+from typing import Dict, Union, Optional, Iterable, Tuple
+
+import torch
+import torch.nn as nn
+from transformers import PreTrainedModel
+
+from vllm.config import (DeviceConfig, LoRAConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig)
+from vllm.model_executor.model_loader import BaseModelLoader
+from vllm.model_executor.model_loader.loader import _initialize_model
+from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+from vllm.distributed.communication_op import tensor_model_parallel_all_gather
+
+from .config import ModelConfig, LoadFormat, LoadConfig
+from .megatron_weight_loaders import load_megatron_weights, update_megatron_weight_loader
+from .dtensor_weight_loaders import load_dtensor_weights, update_dtensor_weight_loader
+from .hf_weight_loader import update_hf_weight_loader
+
+
+def get_model(actor_model: Union[PreTrainedModel, Dict], model_config: ModelConfig, load_config: LoadConfig,
+ device_config: DeviceConfig, parallel_config: ParallelConfig, scheduler_config: SchedulerConfig,
+ lora_config: Optional[LoRAConfig], vision_language_config: Optional[VisionLanguageConfig]) -> nn.Module:
+ loader = get_model_loader(load_config)
+ if load_config.load_format.startswith('dummy'):
+ return loader.load_model(model_config=model_config,
+ device_config=device_config,
+ lora_config=lora_config,
+ vision_language_config=vision_language_config,
+ parallel_config=parallel_config,
+ scheduler_config=scheduler_config)
+ else:
+ return loader.load_model(actor_model=actor_model,
+ model_config=model_config,
+ device_config=device_config,
+ lora_config=lora_config,
+ vision_language_config=vision_language_config,
+ parallel_config=parallel_config,
+ scheduler_config=scheduler_config)
+
+
+def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
+ """Get a model loader based on the load format."""
+
+ if isinstance(load_config.load_format, type):
+ return load_config.load_format(load_config)
+
+ if load_config.load_format == LoadFormat.AUTO:
+ update_megatron_weight_loader()
+ return MegatronLoader(load_config)
+
+ # NOTE(sgm): change the weight_loader function at runtime
+ if load_config.load_format == LoadFormat.MEGATRON:
+ update_megatron_weight_loader()
+ return MegatronLoader(load_config)
+
+ if load_config.load_format == LoadFormat.HF:
+ update_hf_weight_loader()
+ return HFLoader(load_config)
+
+ if load_config.load_format == LoadFormat.DTENSOR:
+ update_dtensor_weight_loader()
+ return DTensorLoader(load_config)
+
+ if load_config.load_format == LoadFormat.DUMMY_HF:
+ update_hf_weight_loader()
+ return DummyModelLoader(load_config)
+
+ if load_config.load_format == LoadFormat.DUMMY_MEGATRON:
+ update_megatron_weight_loader()
+ return DummyModelLoader(load_config)
+
+ if load_config.load_format == LoadFormat.DUMMY_DTENSOR:
+ update_dtensor_weight_loader()
+ return DummyModelLoader(load_config)
+
+ raise ValueError('load format not supported in verl: {}; only {} and {} are supported'.format(
+ load_config.load_format, LoadFormat.MEGATRON, LoadFormat.HF))
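+
+# Illustrative usage (assumption, not invoked anywhere in this file): selecting a
+# loader mirrors the dispatch above.
+#
+# load_config = LoadConfig(load_format=LoadFormat.DTENSOR)
+# loader = get_model_loader(load_config)  # returns a DTensorLoader, after
+# # patching the per-layer weight_loader functions via update_dtensor_weight_loader()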
+
+
+class DummyModelLoader(BaseModelLoader):
+ """Model loader that will set model weights to random values."""
+
+ def __init__(self, load_config: LoadConfig):
+ super().__init__(load_config)
+ if load_config.model_loader_extra_config:
+ raise ValueError(f"Model loader extra config is not supported for "
+ f"load format {load_config.load_format}")
+
+ def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig],
+ vision_language_config: Optional[VisionLanguageConfig], parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig) -> nn.Module:
+ with set_default_torch_dtype(model_config.dtype):
+ with torch.device(device_config.device):
+ model = _initialize_model(model_config, self.load_config, lora_config, vision_language_config)
+ # NOTE(woosuk): For accurate performance evaluation, we would assign
+ # random values to the weights; verl skips this because the real
+ # weights are synced in from the actor model afterwards.
+ # initialize_dummy_weights(model)
+ return model.eval()
+
+
+class MegatronLoader(BaseModelLoader):
+ """Model loader that can load the model weights from partitioned megatron model."""
+
+ def __init__(self, load_config: LoadConfig):
+ super().__init__(load_config)
+ if load_config.model_loader_extra_config:
+ raise ValueError(f"Model loader extra config is not supported for "
+ f"load format {load_config.load_format}")
+
+ def _get_weights_iterator(self, actor_model: Union[PreTrainedModel, Dict]):
+ # NOTE(shengguangming): loading weights from the actor model is handled
+ # directly in load_model below; this iterator is not implemented yet.
+ pass
+ # if isinstance(actor_model, nn.Module):
+ # load_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)), vllm_model=model)
+ # else:
+ # load_weights(actor_weights=actor_model, vllm_model=model)
+ # return actor_model
+
+ def load_model(self, actor_model: Union[PreTrainedModel,
+ Dict], model_config: ModelConfig, device_config: DeviceConfig,
+ lora_config: Optional[LoRAConfig], vision_language_config: Optional[VisionLanguageConfig],
+ parallel_config: ParallelConfig, scheduler_config: SchedulerConfig) -> nn.Module:
+ with set_default_torch_dtype(model_config.dtype):
+ with torch.device(device_config.device):
+ model = _initialize_model(model_config, self.load_config, lora_config, vision_language_config)
+
+ # TODO(sgm): This is a hack, we need to register the load_weight() func for each model in vllm
+ if isinstance(actor_model, nn.Module):
+ load_megatron_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)),
+ vllm_model=model)
+ else:
+ load_megatron_weights(actor_weights=actor_model, vllm_model=model)
+
+ for _, module in model.named_modules():
+ quant_method = getattr(module, "quant_method", None)
+ if quant_method is not None:
+ quant_method.process_weights_after_loading(module)
+ # FIXME: Remove this after Mixtral is updated
+ # to use quant_method.
+ if hasattr(module, "process_weights_after_loading"):
+ module.process_weights_after_loading()
+ # NOTE(sgm) Some weights already point to GPU, but we still need this.
+ model = model.cuda() # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage
+ return model.eval()
+
+
+class HFLoader(BaseModelLoader):
+ """Model loader that can load the model weights from model's full params."""
+
+ def __init__(self, load_config: LoadConfig):
+ super().__init__(load_config)
+ if load_config.model_loader_extra_config:
+ raise ValueError(f"Model loader extra config is not supported for "
+ f"load format {load_config.load_format}")
+
+ def _get_weights_iterator(self, actor_model: Union[PreTrainedModel, Dict]):
+ if isinstance(actor_model, Dict):
+ return actor_model.items()
+ elif isinstance(actor_model, nn.Module):
+ return dict(actor_model.named_parameters()).items()
+ else:
+ raise ValueError(f'actor model should be Dict or nn.Module, but got {type(actor_model)}')
+
+ def load_model(self, actor_model: Union[PreTrainedModel,
+ Dict], model_config: ModelConfig, device_config: DeviceConfig,
+ lora_config: Optional[LoRAConfig], vision_language_config: Optional[VisionLanguageConfig],
+ parallel_config: ParallelConfig, scheduler_config: SchedulerConfig) -> nn.Module:
+ with set_default_torch_dtype(model_config.dtype):
+ # with torch.device(device_config.device):
+ # NOTE(sgm): init the model in cpu
+ model = _initialize_model(model_config, self.load_config, lora_config, vision_language_config)
+ model.load_weights(self._get_weights_iterator(actor_model))
+ for _, module in model.named_modules():
+ quant_method = getattr(module, "quant_method", None)
+ if quant_method is not None:
+ quant_method.process_weights_after_loading(module)
+ # FIXME: Remove this after Mixtral is updated
+ # to use quant_method.
+ if hasattr(module, "process_weights_after_loading"):
+ module.process_weights_after_loading()
+ # NOTE(sgm) Some weights already point to GPU, but we still need this.
+ model = model.cuda() # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage
+ return model.eval()
+
+
+class DTensorLoader(BaseModelLoader):
+ """Model loader that can load the model weights from partitioned megatron model."""
+
+ def __init__(self, load_config: LoadConfig):
+ super().__init__(load_config)
+ if load_config.model_loader_extra_config:
+ raise ValueError(f"Model loader extra config is not supported for "
+ f"load format {load_config.load_format}")
+
+ def _get_weights_iterator(self, actor_model: Union[PreTrainedModel, Dict]):
+ # NOTE(shengguangming): loading weights from the actor model is handled
+ # directly in load_model below; this iterator is not implemented yet.
+ pass
+ # if isinstance(actor_model, nn.Module):
+ # load_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)), vllm_model=model)
+ # else:
+ # load_weights(actor_weights=actor_model, vllm_model=model)
+ # return actor_model
+
+ def load_model(self, actor_model: Union[PreTrainedModel,
+ Dict], model_config: ModelConfig, device_config: DeviceConfig,
+ lora_config: Optional[LoRAConfig], vision_language_config: Optional[VisionLanguageConfig],
+ parallel_config: ParallelConfig, scheduler_config: SchedulerConfig) -> nn.Module:
+ with set_default_torch_dtype(model_config.dtype):
+ with torch.device(device_config.device):
+ model = _initialize_model(model_config, self.load_config, lora_config, vision_language_config)
+
+ # TODO(sgm): This is a hack, we need to register the load_weight() func for each model in vllm
+ if isinstance(actor_model, nn.Module):
+ load_dtensor_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)),
+ vllm_model=model)
+ else:
+ load_dtensor_weights(actor_weights=actor_model, vllm_model=model)
+
+ for _, module in model.named_modules():
+ quant_method = getattr(module, "quant_method", None)
+ if quant_method is not None:
+ quant_method.process_weights_after_loading(module)
+ # FIXME: Remove this after Mixtral is updated
+ # to use quant_method.
+ if hasattr(module, "process_weights_after_loading"):
+ module.process_weights_after_loading()
+ # NOTE(sgm) Some weights already point to GPU, but we still need this.
+ model = model.cuda() # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage
+ return model.eval()
+
+
+# FIXME(sgm): hack the _get_logits function in vllm v0.4.2
+# Upstream vllm uses ray, so the _get_logits result only needs to be returned to
+# the driver node and a gather is enough. We use SPMD instead of a central
+# scheduler, so an all_gather is required (aligned with v0.2.6).
+def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor,
+ embedding_bias: Optional[torch.Tensor]) -> torch.Tensor:
+ # Get the logits for the next tokens.
+ logits = torch.matmul(hidden_states, embedding.t())
+ if embedding_bias is not None:
+ logits += embedding_bias
+ logits = tensor_model_parallel_all_gather(logits)
+ # Remove paddings in vocab (if any).
+ if logits is not None:
+ logits = logits[:, :self.org_vocab_size]
+ return logits
+
+
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+
+LogitsProcessor._get_logits = _get_logits
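+
+# Illustrative sketch (assumed shapes, for documentation only): with tp_size=2 and
+# a vocab padded to 1024 entries per rank, each rank computes logits of shape
+# [num_tokens, 1024]; tensor_model_parallel_all_gather concatenates them along the
+# last dim into [num_tokens, 2048] on *every* rank (ray-based vllm would gather
+# onto the driver only), and the slice [:, :org_vocab_size] drops vocab padding.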
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/model_runner.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/model_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..1604b03630456a8adc44b31cec767f69f709899e
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/model_runner.py
@@ -0,0 +1,281 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/worker/model_runner.py
+
+import torch
+import torch.nn as nn
+from enum import IntEnum
+from typing import Dict, List, Optional, Set, Tuple, Union
+
+from vllm.attention import (AttentionMetadata, get_attn_backend)
+from vllm.config import (DeviceConfig, LoRAConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig)
+from vllm.logger import init_logger
+from vllm.lora.layers import LoRAMapping
+from vllm.lora.request import LoRARequest
+from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
+from vllm.model_executor import SamplingMetadata
+from vllm.sequence import (MultiModalData, SamplerOutput, SequenceData, SequenceGroupMetadata)
+from vllm.utils import (CudaMemoryProfiler, is_hip, is_pin_memory_available)
+from vllm.worker.model_runner import ModelRunner, CUDAGraphRunner
+
+from .model_loader import get_model
+from .config import ModelConfig, LoadConfig
+
+logger = init_logger(__name__)
+
+
+# How batches are constructed.
+class BatchType(IntEnum):
+ # Every batch is prefill.
+ PREFILL = 0
+ # Every batch is decode.
+ DECODE = 1
+ # Batch is a mixture of prefill and decode.
+ MIXED = 2
+
+
+class ModelRunner(ModelRunner):
+
+ def __init__(
+ self,
+ model: Union[nn.Module, Dict], # model itself or its parameter dict
+ model_config: ModelConfig,
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ device_config: DeviceConfig,
+ load_config: LoadConfig,
+ lora_config: Optional[LoRAConfig],
+ kv_cache_dtype: Optional[str] = "auto",
+ vision_language_config: Optional[VisionLanguageConfig] = None,
+ ):
+ self.model_config = model_config
+ self.parallel_config = parallel_config
+ self.scheduler_config = scheduler_config
+ self.lora_config = lora_config
+ self.load_config = load_config
+
+ # model_config can be None in tests/samplers/test_sampler.py.
+ # FIXME(woosuk): This is a hack to make the tests work. Refactor this.
+ self.sliding_window = (model_config.get_sliding_window() if model_config is not None else None)
+ self.device_config = (device_config if device_config is not None else DeviceConfig())
+ self.device = self.device_config.device
+
+ # NOTE(sgm): add for verl
+ self.model = model # this will be replaced by get_model()
+
+ # Set after load_model.
+ self.lora_manager: LRUCacheWorkerLoRAManager = None
+
+ self.graph_runners: Dict[int, CUDAGraphRunner] = {}
+ self.graph_memory_pool: Optional[Tuple[int, int]] = None # Set during graph capture.
+
+ self.max_seq_len_to_capture = (self.model_config.max_seq_len_to_capture if self.model_config is not None else 0)
+
+ self.pin_memory = is_pin_memory_available()
+ self.kv_cache_dtype = kv_cache_dtype
+ self.vision_language_config = vision_language_config
+
+ self.attn_backend = get_attn_backend(self.model_config.dtype if model_config is not None else None)
+
+ # Lazy initialization
+ self.block_size: int # Set after initial profiling.
+ # When using CUDA graph, the input block tables must be padded to
+ # max_seq_len_to_capture. However, creating the block table in
+ # Python can be expensive. To optimize this, we cache the block table
+ # in numpy and only copy the actual input content at every iteration.
+ # The shape of the cached block table will be
+ # (max batch size to capture, max context len to capture / block size).
+ self.graph_block_tables: torch.Tensor # Set after initial profiling.
+
+ # Set if the backend is flashinfer.
+ self.flashinfer_workspace_buffer: torch.Tensor
+
+ # NOTE(sgm): initialize model using the actor model
+ def load_model(self) -> None:
+ with CudaMemoryProfiler() as m:
+ self.model = get_model(actor_model=self.model,
+ model_config=self.model_config,
+ device_config=self.device_config,
+ lora_config=self.lora_config,
+ load_config=self.load_config,
+ parallel_config=self.parallel_config,
+ scheduler_config=self.scheduler_config,
+ vision_language_config=self.vision_language_config)
+ self.model_memory_usage = m.consumed_memory
+ logger.info("Loading model weights took %.4f GB", self.model_memory_usage / float(2**30))
+
+ if self.lora_config:
+ assert hasattr(self.model, "supported_lora_modules") and self.model.supported_lora_modules, (
+ "Model does not support LoRA")
+ assert hasattr(self.model, "embedding_modules"), "Model does not have embedding_modules"
+ assert hasattr(self.model, "embedding_padding_modules"), "Model does not have embedding_padding_modules"
+ self.lora_manager = LRUCacheWorkerLoRAManager(self.scheduler_config.max_num_seqs,
+ self.scheduler_config.max_num_batched_tokens, self.vocab_size,
+ self.lora_config, self.device, self.model.embedding_modules,
+ self.model.embedding_padding_modules)
+ self.model = self.lora_manager.create_lora_manager(self.model)
+
+ if self.kv_cache_dtype == "fp8" and is_hip():
+ # Currently scaled KV cache is only enabled on ROCm
+ if self.model_config.quantization_param_path is not None:
+ if callable(getattr(self.model, "load_kv_cache_scales", None)):
+ self.model.load_kv_cache_scales(self.model_config.quantization_param_path)
+ else:
+ raise RuntimeError(
+ "Using FP8 KV cache and scaling factors provided but "
+ f"model {self.model.__class__} does not support loading scaling factors.")
+ else:
+ logger.warning("Using FP8 KV cache but no scaling factors "
+ "provided. Defaulting to scaling factors of 1.0. "
+ "This may lead to less accurate results!")
+ elif self.model_config.quantization_param_path is not None:
+ logger.warning("KV cache scaling factors provided, "
+ "but the KV cache data type is not FP8. "
+ "KV cache scaling factors will not be used.")
+
+ def prepare_input_tensors(
+ self,
+ seq_group_metadata_list: List[SequenceGroupMetadata],
+ ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, Set[LoRARequest], LoRAMapping,
+ torch.Tensor]:
+ # NOTE(sgm): all workers prepare the input in the same way
+ prefill_reqs = []
+ decode_reqs = []
+ for seq_group_meta in seq_group_metadata_list:
+ if seq_group_meta.is_prompt:
+ prefill_reqs.append(seq_group_meta)
+ else:
+ decode_reqs.append(seq_group_meta)
+
+ # Prepare input tensors.
+ (
+ input_tokens,
+ input_positions,
+ prefill_attn_metadata,
+ seq_lens,
+ query_lens,
+ lora_index_mapping,
+ lora_prompt_mapping,
+ lora_requests,
+ multi_modal_input,
+ slot_mapping,
+ ) = self._prepare_prompt(prefill_reqs)
+ (
+ decode_input_tokens,
+ decode_input_positions,
+ decode_attn_metadata,
+ decode_lora_index_mapping,
+ decode_lora_prompt_mapping,
+ decode_lora_requests,
+ decode_slot_mapping,
+ ) = self._prepare_decode(decode_reqs)
+ sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, seq_lens, query_lens, self.device,
+ self.pin_memory)
+
+ if not self.scheduler_config.chunked_prefill_enabled:
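+ # i.e., without chunked prefill, a batch must be all-prefill or all-decode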
+ assert (len(prefill_reqs) and len(decode_reqs)) == 0
+
+ num_prefills = len(seq_lens)
+ num_prefill_tokens = len(input_tokens)
+ num_decode_tokens = len(decode_input_tokens)
+
+ # Coalesce tensors. Note that attn_metadata is currently not
+ # coalesced for simplicity.
+ input_tokens.extend(decode_input_tokens)
+ input_positions.extend(decode_input_positions)
+ slot_mapping.extend(decode_slot_mapping)
+ lora_index_mapping.extend(decode_lora_index_mapping)
+ lora_prompt_mapping.extend(decode_lora_prompt_mapping)
+ lora_requests.update(decode_lora_requests)
+
+ input_tokens = torch.tensor(input_tokens, dtype=torch.long, device=self.device)
+ input_positions = torch.tensor(input_positions, dtype=torch.long, device=self.device)
+ slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=self.device)
+
+ if self.lora_config:
+ lora_mapping = LoRAMapping(
+ lora_index_mapping,
+ lora_prompt_mapping,
+ )
+ else:
+ lora_mapping = None
+
+ # Broadcast the metadata.
+ # If batch contains both prefill and decode, it sends 2 broadcasts.
+ # If it only contains 1 type, it triggers a single broadcast.
+ if (prefill_attn_metadata is not None and decode_attn_metadata is not None):
+ batch_type = BatchType.MIXED
+ elif prefill_attn_metadata is not None:
+ batch_type = BatchType.PREFILL
+ else:
+ batch_type = BatchType.DECODE
+
+ attn_metadata = AttentionMetadata(
+ num_prefills=num_prefills,
+ slot_mapping=slot_mapping,
+ num_prefill_tokens=num_prefill_tokens,
+ num_decode_tokens=num_decode_tokens,
+ prefill_metadata=prefill_attn_metadata,
+ decode_metadata=decode_attn_metadata,
+ kv_cache_dtype=self.kv_cache_dtype,
+ )
+
+ return (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping,
+ multi_modal_input)
+
+ @torch.inference_mode()
+ def execute_model(
+ self,
+ seq_group_metadata_list: List[SequenceGroupMetadata],
+ kv_caches: List[torch.Tensor],
+ ) -> Optional[SamplerOutput]:
+ (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping,
+ multi_modal_input) = self.prepare_input_tensors(seq_group_metadata_list)
+
+ if self.lora_config:
+ self.set_active_loras(lora_requests, lora_mapping)
+
+ # Currently cuda graph is only supported by the decode phase.
+ prefill_meta = attn_metadata.prefill_metadata
+ decode_meta = attn_metadata.decode_metadata
+ if prefill_meta is None and decode_meta.use_cuda_graph:
+ graph_batch_size = input_tokens.shape[0]
+ model_executable = self.graph_runners[graph_batch_size]
+ else:
+ model_executable = self.model
+ execute_model_kwargs = {
+ "input_ids": input_tokens,
+ "positions": input_positions,
+ "kv_caches": kv_caches,
+ "attn_metadata": attn_metadata,
+ }
+ if self.vision_language_config:
+ execute_model_kwargs.update({"image_input": multi_modal_input})
+ hidden_states = model_executable(**execute_model_kwargs)
+
+ # Compute the logits.
+ logits = self.model.compute_logits(hidden_states, sampling_metadata)
+
+ # Only perform sampling in the driver worker.
+ # if not self.is_driver_worker:
+ # return None
+
+ # TODO(sgm): perform sampling on rank 0
+ # Sample the next token.
+ output = self.model.sample(
+ logits=logits,
+ sampling_metadata=sampling_metadata,
+ )
+
+ return output
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/parallel_state.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/parallel_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..be7464a2a50e3a968f8d0636e7a40f2d6cf57f56
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/parallel_state.py
@@ -0,0 +1,294 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Adapted from
+# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+"""Model and data parallel groups."""
+import os
+import torch
+import torch.distributed
+from typing import Optional
+
+import vllm.distributed.parallel_state as ps
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+
+from torch.distributed.device_mesh import init_device_mesh
+
+logger = init_logger(__name__)
+"""
+This version is strongly tied to Megatron to implement HybridEngine and weight sharing between vllm and Megatron.
+- We assume the Megatron tp+dp+pp world is already established before calling this function.
+
+"""
+
+# Device mesh for using DTensor
+_DEVICE_MESH = None
+
+# Tensor model parallel group that the current rank belongs to.
+_TP_DEVICE_GROUP = None
+_TP_CPU_GROUP = None
+
+
+# This method is for initializing the ParallelGroup when using HybridEngine
+def initialize_parallel_state(
+ distributed_init_method: str = "env://",
+ backend: str = "nccl",
+ tensor_model_parallel_size: int = 1,
+ num_tp_per_train_tp: int = 1,
+ pipeline_model_parallel_size: int = 1,
+):
+ # torch.distributed.all_reduce does not free the input tensor until
+ # the synchronization point. This causes the memory usage to grow
+ # as the number of all_reduce calls increases. This env var disables
+ # this behavior.
+ # Related issue:
+ # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
+ os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+ # NOTE(sgm): Modified for verl; env vars will be set by TORCHRUN.
+ rank = int(os.getenv("RANK", "-1"))
+ local_rank = int(os.getenv("LOCAL_RANK", "0"))
+
+ # Use the world_size set by TORCHRUN
+ world_size = int(os.getenv("WORLD_SIZE", "-1"))
+ assert world_size != -1, "The world_size is set to -1, not initialized by TORCHRUN"
+ ps.init_distributed_environment(world_size, rank, distributed_init_method, local_rank, backend)
+ if torch.distributed.get_world_size() > 1:
+ # NOTE: build a separate inference group with infer tp & micro dp
+ initialize_model_parallel_for_vllm(tensor_model_parallel_size=tensor_model_parallel_size,
+ num_tensor_model_parallel_groups_per_train_tp=num_tp_per_train_tp)
+ else:
+ initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend)
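+
+# Illustrative launch (assumption; script name for example only): TORCHRUN
+# populates the RANK/LOCAL_RANK/WORLD_SIZE env vars consumed above, e.g.
+# torchrun --nproc_per_node=8 train.py
+# after which initialize_parallel_state(tensor_model_parallel_size=2) builds
+# 4 inference TP groups of size 2 out of the 8-rank world.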
+
+
+def ensure_model_parallel_initialized(
+ tensor_model_parallel_size: int,
+ pipeline_model_parallel_size: int = 1,
+ backend: Optional[str] = None,
+) -> None:
+ """Helper to initialize model parallel groups if they are not initialized,
+ or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
+ values if the model parallel groups are initialized.
+ """
+ # get the backend of _DEVICE_WORLD_GROUP
+ backend = backend or torch.distributed.get_backend()
+ if not model_parallel_is_initialized():
+ initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend)
+ return
+
+ assert (get_tensor_model_parallel_world_size() == tensor_model_parallel_size), (
+ "tensor parallel group already initialized, but of unexpected size: "
+ f"{get_tensor_model_parallel_world_size()=} vs. "
+ f"{tensor_model_parallel_size=}")
+ # assert (get_pipeline_model_parallel_world_size(
+ # ) == pipeline_model_parallel_size), (
+ # "pipeline parallel group already initialized, but of unexpected size: "
+ # f"{get_pipeline_model_parallel_world_size()=} vs. "
+ # f"{pipeline_model_parallel_size=}")
+
+
+def model_parallel_is_initialized():
+ """Check if tensor and pipeline parallel groups are initialized."""
+ return (ps._TP_DEVICE_GROUP is not None)
+ # and _PIPELINE_MODEL_PARALLEL_GROUP is not None)
+
+
+def initialize_model_parallel_for_vllm(tensor_model_parallel_size: int,
+ num_tensor_model_parallel_groups_per_train_tp: int = 1) -> None:
+ # Get world size and rank. Ensure some consistencies.
+ assert torch.distributed.is_initialized()
+
+ assert isinstance(tensor_model_parallel_size, int)
+
+ # assert num_tensor_model_parallel_groups_per_train_tp == 1 and not different_tp_group
+ # assert num_tensor_model_parallel_groups_per_train_tp > 1 and different_tp_group
+
+ # Build the tensor model-parallel groups.
+ assert ps._TP_DEVICE_GROUP is None, ("tensor model parallel group is already initialized")
+
+ global _TP_DEVICE_GROUP
+ global _TP_CPU_GROUP
+ global _DEVICE_MESH
+
+ world_size: int = torch.distributed.get_world_size()
+
+ rank = torch.distributed.get_rank()
+
+ backend = torch.distributed.get_backend()
+
+ num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
+
+ if num_tensor_model_parallel_groups_per_train_tp == 1:
+ # if tensor_model_parallel_size == train_tensor_parallel_size:
+ # using the same tp group as Megatron/vllm
+ for i in range(num_tensor_model_parallel_groups):
+ ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)
+ group = torch.distributed.new_group(ranks, backend=backend)
+ cpu_group = torch.distributed.new_group(ranks, backend="gloo")
+ if rank in ranks:
+ _TP_DEVICE_GROUP = group
+ _TP_CPU_GROUP = cpu_group
+ ps._TP_DEVICE_GROUP = group
+ ps._TP_CPU_GROUP = cpu_group
+
+ # no _MICRO_DATA_PARALLEL_GROUP
+ else:
+ # initialize a micro_dp group and a tp group
+ # assume training tp=4, infer tp=2, then, weight is partitioned as
+ # [1], [2], [3], [4] for training and [1,2], [1,2], [3,4], [3,4] for inference
+
+ # Build the inference tp groups
+ # train_tp = train_tensor_parallel_size
+ train_tp = num_tensor_model_parallel_groups_per_train_tp * tensor_model_parallel_size
+ # num_tensor_model_parallel_groups_per_train_tp = train_tp // tensor_model_parallel_size
+ assert _TP_DEVICE_GROUP is None, ("tensor model parallel group is already initialized")
+ for i in range(num_tensor_model_parallel_groups // num_tensor_model_parallel_groups_per_train_tp):
+ start = train_tp * i
+ end = train_tp * (i + 1)
+ for j in range(num_tensor_model_parallel_groups_per_train_tp):
+ # offset each base rank by j; avoid reusing the outer loop variable
+ ranks = [base_rank + j for base_rank in range(start, end, num_tensor_model_parallel_groups_per_train_tp)]
+ group = torch.distributed.new_group(ranks)
+ cpu_group = torch.distributed.new_group(ranks, backend='gloo')
+ if rank in ranks:
+ _TP_DEVICE_GROUP = group
+ _TP_CPU_GROUP = cpu_group
+ ps._TP_DEVICE_GROUP = _TP_DEVICE_GROUP
+ ps._TP_CPU_GROUP = cpu_group
+
+ # Build the pipeline model-parallel groups.
+ # global _PIPELINE_MODEL_PARALLEL_GROUP
+ # global _PIPELINE_GLOBAL_RANKS
+ # assert ps._PIPELINE_MODEL_PARALLEL_GROUP is None, ("pipeline model parallel group is already initialized")
+
+ # ps._PIPELINE_MODEL_PARALLEL_GROUP = mpu.get_pipeline_model_parallel_group()
+ # ps._PIPELINE_GLOBAL_RANKS = mpu.get_pipeline_model_parallel_ranks()
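+
+ # Worked example of the interleaved branch above (illustrative numbers): with
+ # world_size=8, tensor_model_parallel_size=2 and
+ # num_tensor_model_parallel_groups_per_train_tp=2 (i.e. train tp=4), the
+ # inference TP groups come out as
+ # [0, 2], [1, 3], [4, 6], [5, 7]
+ # so each inference TP pair stays inside a single training tp group.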
+
+
+def initialize_model_parallel(
+ tensor_model_parallel_size: int = 1,
+ pipeline_model_parallel_size: int = 1,
+ backend: Optional[str] = None,
+) -> None:
+ """
+ NOTE: This method is a hack on the open-sourced version; it drops the
+ assertion that world_size == tp * pp.
+
+ Initialize model parallel groups.
+
+ Arguments:
+ tensor_model_parallel_size: number of GPUs used for tensor model
+ parallelism.
+ pipeline_model_parallel_size: number of GPUs used for pipeline model
+ parallelism.
+
+ Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
+ use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
+ the model pipeline. The present function will
+ create 4 tensor model-parallel groups and 2 pipeline model-parallel groups:
+ 4 tensor model-parallel groups:
+ [g0, g1], [g2, g3], [g4, g5], [g6, g7]
+ 2 pipeline model-parallel groups:
+ [g0, g2, g4, g6], [g1, g3, g5, g7]
+ Note that for efficiency, the caller should make sure adjacent ranks
+ are on the same DGX box. For example if we are using 2 DGX-1 boxes
+ with a total of 16 GPUs, rank 0 to 7 belong to the first box and
+ ranks 8 to 15 belong to the second box.
+ """
+ # Get world size and rank. Ensure some consistencies.
+ assert torch.distributed.is_initialized()
+ world_size: int = torch.distributed.get_world_size()
+ # get the backend of _DEVICE_WORLD_GROUP
+ backend = backend or torch.distributed.get_backend()
+
+ # NOTE(sgm) we don't assert world_size == tp * pp
+ # DP is not managed by vllm but by the veRL WorkerGroup
+
+ num_tensor_model_parallel_groups: int = (world_size // tensor_model_parallel_size)
+ num_pipeline_model_parallel_groups: int = (world_size // pipeline_model_parallel_size)
+ rank = torch.distributed.get_rank()
+
+ # Build device mesh for TP
+ if num_tensor_model_parallel_groups > 1:
+ device_mesh = init_device_mesh("cuda", (num_tensor_model_parallel_groups, tensor_model_parallel_size),
+ mesh_dim_names=("replicate", "tp_shard"))
+ else:
+ device_mesh = init_device_mesh("cuda", (tensor_model_parallel_size,), mesh_dim_names=["tp_shard"])
+ shard_group = device_mesh.get_group(mesh_dim="tp_shard")
+
+ # Build the tensor model-parallel groups.
+ global _TP_DEVICE_GROUP, _TP_CPU_GROUP
+ global _DEVICE_MESH
+ assert _TP_DEVICE_GROUP is None, ("tensor model parallel group is already initialized")
+ assert _DEVICE_MESH is None, ("device mesh in vllm is already initialized")
+
+ _DEVICE_MESH = device_mesh
+ # for i in range(num_tensor_model_parallel_groups):
+ # ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)
+ # group = torch.distributed.new_group(ranks, backend=backend)
+ # cpu_group = torch.distributed.new_group(ranks, backend="gloo")
+ # assert torch.distributed.get_process_group_ranks(shard_group) == torch.distributed.get_process_group_ranks(cpu_group)
+ # ranks = torch.distributed.get_process_group_ranks(shard_group)
+ # cpu_group = torch.distributed.new_group(ranks, backend="gloo") # TODO: this will hang
+ # cpu_group = torch.distributed.new_group(, backend="gloo")
+ # if rank == 0:
+ # print(f'rank: {rank}')
+ # print(f'ranks: {ranks}')
+ # print(f'torch.distributed.get_process_group_ranks(shard_group): {torch.distributed.get_process_group_ranks(shard_group)}')
+ # if rank in ranks:
+ _TP_DEVICE_GROUP = shard_group
+ ps._TP_DEVICE_GROUP = _TP_DEVICE_GROUP
+ # ps._TP_CPU_GROUP = cpu_group # TODO: will hang when used with device mesh
+
+ # TODO: init using device mesh
+ # Build the pipeline model-parallel groups.
+ assert ps._PIPELINE_MODEL_PARALLEL_GROUP is None, ("pipeline model parallel group is already initialized")
+ for i in range(num_pipeline_model_parallel_groups):
+ ranks = range(i, world_size, num_pipeline_model_parallel_groups)
+ group = torch.distributed.new_group(ranks, backend=backend)
+ if rank in ranks:
+ ps._PIPELINE_MODEL_PARALLEL_GROUP = group
+ ps._PIPELINE_GLOBAL_RANKS = ranks
+
+
+"""
+Device mesh utilities
+"""
+
+
+def get_device_mesh():
+ assert _DEVICE_MESH is not None, ("device mesh is not initialized")
+ return _DEVICE_MESH
+
+
+"""
+Tensor model parallel utilities
+"""
+
+
+def get_tensor_model_parallel_group():
+ """Get the tensor model parallel group the caller rank belongs to."""
+ assert _TP_DEVICE_GROUP is not None, ("tensor model parallel group is not initialized")
+ return _TP_DEVICE_GROUP
+
+
+def get_tensor_model_parallel_world_size():
+ """Return world size for the tensor model parallel group."""
+ return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())
+
+
+def get_tensor_model_parallel_rank():
+ """Return my rank for the tensor model parallel group."""
+ return torch.distributed.get_rank(group=get_tensor_model_parallel_group())
+
+
+def get_tensor_model_parallel_src_rank():
+ """Calculate the global rank corresponding to the first local rank
+ in the tensor model parallel group."""
+ global_rank = torch.distributed.get_rank()
+ local_world_size = get_tensor_model_parallel_world_size()
+ return (global_rank // local_world_size) * local_world_size
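+
+# Illustrative check (assumed sizes): with a TP world size of 4, global ranks 4..7
+# share one TP group, so get_tensor_model_parallel_src_rank() returns 4 for each
+# of them ((5 // 4) * 4 == 4), i.e. the first rank of the caller's TP group.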
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/spmd_gpu_executor.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/spmd_gpu_executor.py
new file mode 100644
index 0000000000000000000000000000000000000000..b97bb600ac318cc4b769805a5dcedc24b296f4b3
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/spmd_gpu_executor.py
@@ -0,0 +1,218 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/executor/gpu_executor.py
+import os
+import socket
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+import torch
+import vllm.envs as envs
+from vllm.executor.executor_base import ExecutorBase, ExecutorAsyncBase
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.sequence import SamplerOutput, ExecuteModelRequest
+
+from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig,
+ VisionLanguageConfig)
+from .config import ModelConfig, LoadConfig
+
+logger = init_logger(__name__)
+
+
+class SPMDGPUExecutor(ExecutorBase):
+ """SPMD-based multi-GPU executor implementations."""
+
+ def __init__(
+ self,
+ model, # pytorch model itself or its parameter dict
+ model_config: ModelConfig,
+ cache_config: CacheConfig,
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ device_config: DeviceConfig,
+ load_config: LoadConfig,
+ lora_config: Optional[LoRAConfig],
+ vision_language_config: Optional[VisionLanguageConfig],
+ speculative_config: Optional[SpeculativeConfig],
+ ) -> None:
+ self.model_config = model_config
+ self.cache_config = cache_config
+ self.lora_config = lora_config
+ self.load_config = load_config
+ self.parallel_config = parallel_config
+ self.scheduler_config = scheduler_config
+ self.device_config = device_config
+ self.vision_language_config = vision_language_config
+ self.speculative_config = speculative_config
+
+ distributed_init_method = initialize_cluster(parallel_config)
+ self._init_executor(model, distributed_init_method)
+
+ # TODO(sgm): verl does not support speculative decoding yet
+ def _init_executor(self, model, distributed_init_method) -> None:
+ assert (not self.speculative_config), "Speculative decoding not yet supported for multi-GPU backend."
+
+ # Create the parallel worker for each GPU.
+ self._init_workers_sp(model, distributed_init_method)
+
+ def _init_workers_sp(self, model, distributed_init_method: str):
+ # Lazy import the Worker to avoid importing torch.cuda/xformers
+ # before CUDA_VISIBLE_DEVICES is set in the Worker
+ from .worker import Worker # pylint: disable=import-outside-toplevel
+
+ rank = int(os.getenv("RANK"))
+ local_rank = int(os.getenv("LOCAL_RANK"))
+ print(f'local rank {local_rank}')
+
+ self.worker = Worker(
+ model,
+ self.model_config,
+ self.parallel_config,
+ self.scheduler_config,
+ self.device_config,
+ self.cache_config,
+ self.load_config,
+ local_rank,
+ rank,
+ distributed_init_method,
+ lora_config=self.lora_config,
+ vision_language_config=self.vision_language_config,
+ )
+
+ # NOTE(shengguangming): torch.distributed.init_process_group will be called inside the init_model()
+ self.worker.init_device()
+ self.worker.load_model()
+
+ def determine_num_available_blocks(self) -> Tuple[int, int]:
+ """Determine the number of available KV blocks.
+
+ This invokes `determine_num_available_blocks` on each worker and takes
+ the min of the results, guaranteeing that the selected cache sizes are
+ compatible with all workers.
+
+ Returns:
+ - tuple[num_gpu_blocks, num_cpu_blocks]
+ """
+ # Get the maximum number of blocks that can be allocated on GPU and CPU.
+ num_blocks = self.worker.determine_num_available_blocks()
+
+ # NOTE(shengguangming): we don't use a shared centralized controller; each
+ # process has its own scheduler
+ num_gpu_blocks = num_blocks[0]
+ num_cpu_blocks = num_blocks[1]
+
+ return num_gpu_blocks, num_cpu_blocks
+
+ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
+ """Initialize the KV cache in all workers.
+ """
+
+ # NOTE: We log here to avoid multiple logs when number of workers is
+ # greater than one. We could log in the engine, but not all executors
+ # have GPUs.
+ logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, num_cpu_blocks)
+
+ self.cache_config.num_gpu_blocks = num_gpu_blocks
+ self.cache_config.num_cpu_blocks = num_cpu_blocks
+
+ if torch.distributed.get_rank() == 0:
+ print(
+ f'before init cache memory allocated: {torch.cuda.memory_allocated() / 1e9}GB, reserved: {torch.cuda.memory_reserved() / 1e9}GB'
+ )
+ self.worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks)
+ if torch.distributed.get_rank() == 0:
+ print(
+ f'after init cache memory allocated: {torch.cuda.memory_allocated() / 1e9}GB, reserved: {torch.cuda.memory_reserved() / 1e9}GB'
+ )
+
+ # NOTE(sgm): This will not profile & capture the model(CUDAGraph) when rebuilding KVCache
+ def init_cache_engine(self) -> None:
+ self.worker._init_cache_engine()
+
+ def free_cache_engine(self) -> None:
+ self.worker.free_cache_engine()
+
+ def execute_model(self, execute_model_req) -> List[SamplerOutput]:
+ all_outputs = self.worker.execute_model(execute_model_req=execute_model_req)
+
+ # NOTE(sgm):
+ # Each GPU in vllm under verl has its own spmd_gpu_executor, therefore all GPUs should return the outputs
+ # In vllm with ray, only the driver worker returns the sampling results.
+ return all_outputs
+
+ def add_lora(self, lora_request: LoRARequest) -> bool:
+ assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
+ return self.worker.add_lora(lora_request=lora_request)
+
+ def remove_lora(self, lora_id: int) -> bool:
+ assert lora_id > 0, "lora_id must be greater than 0."
+ return self.worker.remove_lora(lora_id=lora_id)
+
+ def list_loras(self) -> Set[int]:
+ return self.worker.list_loras()
+
+ def check_health(self) -> None:
+ # SPMDExecutor will always be healthy as long as
+ # it's running.
+ return
+
+ # NOTE(sgm): add for verl
+ def offload_model_weights(self) -> None:
+ self.worker.offload_model_weights()
+
+ def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
+ self.worker.sync_model_weights(actor_weights=actor_weights, load_format=load_format)
+
+
+def initialize_cluster(
+ parallel_config: ParallelConfig,
+ engine_use_ray: bool = False,
+ ray_address: Optional[str] = None,
+) -> str:
+ """Initialize the distributed environment for the local cluster (Ray is not used in verl's SPMD mode).
+
+ Args:
+ parallel_config: The configurations for parallel execution.
+
+ Returns:
+ The `distributed_init_method` is the address for initializing the
+ distributed backend.
+ """
+
+ # Initialize cluster locally.
+ # port = get_open_port()  # unused: the init method below is env://; kept for the tcp:// variant
+ # We need to setup the distributed init method to make sure
+ # the distributed megatron code (e.g., get world size) works correctly.
+ # distributed_init_method = f"tcp://localhost:{port}"
+ distributed_init_method = 'env://'
+ return distributed_init_method
+
+
+def get_open_port():
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+ s.bind(("", 0))
+ return s.getsockname()[1]
+
+
+# TODO(sgm): not implemented async executor yet
+class SPMDGPUExecutorAsync(SPMDGPUExecutor, ExecutorAsyncBase):
+
+ async def execute_model_async(self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+ """Executes one model step on the given sequences."""
+ raise NotImplementedError
+
+ async def check_health_async(self) -> None:
+ """Checks if the executor is healthy. If not, it should raise an
+ exception."""
+ self.check_health()
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/tokenizer.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa625a0338686d61816e838ef802cde327fc95c4
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/tokenizer.py
@@ -0,0 +1,77 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
+
+from typing import List, Optional, Tuple, Union
+
+from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast)
+
+from vllm.lora.request import LoRARequest
+from vllm.utils import make_async, LRUCache
+from vllm.transformers_utils.tokenizers import *
+
+
+class TokenizerGroup:
+ """A group of tokenizers that can be used for LoRA adapters."""
+
+ def __init__(self, tokenizer: PreTrainedTokenizer, enable_lora: bool, max_num_seqs: int,
+ max_input_length: Optional[int]):
+ self.enable_lora = enable_lora
+ self.max_input_length = max_input_length
+ self.tokenizer = tokenizer
+ self.lora_tokenizers = LRUCache[PreTrainedTokenizer](capacity=max_num_seqs) if enable_lora else None
+
+ def ping(self) -> bool:
+ """Check if the tokenizer group is alive."""
+ return True
+
+ def get_max_input_len(self, lora_request: Optional[LoRARequest] = None) -> Optional[int]:
+ """Get the maximum input length for the LoRA request."""
+ return self.max_input_length
+
+ def encode(self,
+ prompt: str,
+ request_id: Optional[str] = None,
+ lora_request: Optional[LoRARequest] = None) -> List[int]:
+ tokenizer = self.get_lora_tokenizer(lora_request)
+ return tokenizer.encode(prompt)
+
+ async def encode_async(self,
+ prompt: str,
+ request_id: Optional[str] = None,
+ lora_request: Optional[LoRARequest] = None) -> List[int]:
+ tokenizer = await self.get_lora_tokenizer_async(lora_request)
+ return tokenizer.encode(prompt)
+
+ def get_lora_tokenizer(self, lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer":
+ if not lora_request or not self.enable_lora:
+ return self.tokenizer
+ if lora_request.lora_int_id not in self.lora_tokenizers:
+ # TODO(sgm): the lora tokenizer is also passed, but may be different
+ tokenizer = self.tokenizer
+ # tokenizer = (get_lora_tokenizer(
+ # lora_request, **self.tokenizer_config) or self.tokenizer)
+ self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
+ return tokenizer
+ else:
+ return self.lora_tokenizers.get(lora_request.lora_int_id)
+
+ # FIXME(sgm): for simplicity, we assign the special token here
+ @property
+ def pad_token_id(self):
+ return self.tokenizer.pad_token_id
+
+ @property
+ def eos_token_id(self):
+ return self.tokenizer.eos_token_id
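+
+
+# Illustrative usage (assumption; model name for example only):
+# tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
+# group = TokenizerGroup(tokenizer, enable_lora=False, max_num_seqs=256, max_input_length=None)
+# token_ids = group.encode("hello world")  # falls through to the base tokenizer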
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/worker.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/worker.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fab3e41fe87ea599f251f5da2ebb67b54f84b81
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_4_2/worker.py
@@ -0,0 +1,292 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/worker/worker.py
+"""A GPU worker class."""
+import os
+import gc
+from typing import Dict, List, Tuple, Optional, Union
+
+import torch
+import torch.distributed
+import torch.nn as nn
+
+from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig)
+from vllm.model_executor import set_random_seed
+from vllm.sequence import SamplerOutput, ExecuteModelRequest
+from vllm.worker.cache_engine import CacheEngine
+from vllm.distributed.device_communicators import pynccl_utils
+from vllm.distributed.device_communicators.custom_all_reduce import (init_custom_ar)
+# TODO(sgm): check why vllm has a similar file in vllm.model_executor.parallel_utils.parallel_state
+from vllm.distributed import get_tensor_model_parallel_cpu_group, init_distributed_environment, get_tensor_model_parallel_group
+from vllm.worker.worker import Worker, _check_if_gpu_supports_dtype
+
+from .model_runner import ModelRunner
+from .megatron_weight_loaders import load_megatron_weights
+from .hf_weight_loader import load_hf_weights
+from .dtensor_weight_loaders import load_dtensor_weights
+from .parallel_state import (ensure_model_parallel_initialized)
+from .config import ModelConfig, LoadConfig, LoadFormat
+
+
+class Worker(Worker):
+ """A worker class that executes (a partition of) the model on a GPU.
+
+ Each worker is associated with a single GPU. The worker is responsible for
+ maintaining the KV cache and executing the model on the GPU. In case of
+ distributed inference, each worker is assigned a partition of the model.
+ """
+
+ def __init__(
+ self,
+ model: Union[nn.Module, Dict], # model itself or its parameter dict
+ model_config: ModelConfig,
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ device_config: DeviceConfig,
+ cache_config: CacheConfig,
+ load_config: LoadConfig,
+ local_rank: int,
+ rank: int,
+ distributed_init_method: str,
+ lora_config: Optional[LoRAConfig] = None,
+ vision_language_config: Optional[VisionLanguageConfig] = None,
+ is_driver_worker: bool = False,
+ ) -> None:
+ # self.model = model # will be replaced in the init_model
+ self.model_config = model_config
+ self.parallel_config = parallel_config
+ self.scheduler_config = scheduler_config
+ self.device_config = device_config
+ self.cache_config = cache_config
+ self.local_rank = local_rank
+ self.rank = rank
+ self.distributed_init_method = distributed_init_method
+ self.lora_config = lora_config
+ self.load_config = load_config
+ self.is_driver_worker = is_driver_worker
+ if self.is_driver_worker:
+ assert self.rank == 0, "The driver worker must have rank 0."
+
+ self.vision_language_config = vision_language_config
+ if self.vision_language_config:
+ assert not self.lora_config, ("To be tested: vision language model with LoRA settings.")
+
+ self.model_runner = ModelRunner(
+ model,
+ model_config,
+ parallel_config,
+ scheduler_config,
+ device_config,
+ load_config=load_config,
+ lora_config=self.lora_config,
+ kv_cache_dtype=self.cache_config.cache_dtype,
+ vision_language_config=vision_language_config,
+ )
+
+ # Uninitialized cache engine. Will be initialized by
+ # init_cache_engine.
+ self.cache_engine: CacheEngine = None
+ self.gpu_cache: List[torch.Tensor] = None
+
+ # NOTE(sgm): For offloading inference engine params
+ self.cpu_model = None
+
+ def init_device(self) -> None:
+ if self.device_config.device.type == "cuda":
+ # torch.distributed.all_reduce does not free the input tensor until
+ # the synchronization point. This causes the memory usage to grow
+ # as the number of all_reduce calls increases. This env var disables
+ # this behavior.
+ # Related issue:
+ # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
+ os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+ # NOTE(sgm): Modified for verl; env vars will be set by TORCHRUN.
+ self.rank = self.rank if self.rank is not None else int(os.getenv("RANK", "-1"))
+ local_rank = int(os.getenv("LOCAL_RANK", "0"))
+ self.device = torch.device(f"cuda:{local_rank}")
+ if self.rank < 0:
+ raise ValueError("Invalid or unspecified rank.")
+ torch.cuda.set_device(self.device)
+
+ # Use the world_size set by TORCHRUN
+ world_size = int(os.getenv("WORLD_SIZE", "-1"))
+ assert world_size != -1, "The world_size is set to -1, not initialized by TORCHRUN"
+ self.parallel_config.world_size = world_size
+
+ _check_if_gpu_supports_dtype(self.model_config.dtype)
+ torch.cuda.empty_cache()
+ self.init_gpu_memory = torch.cuda.mem_get_info()[0]
+ else:
+ raise RuntimeError(f"Not support device type: {self.device_config.device}")
+
+ # Initialize the distributed environment.
+ init_worker_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method,
+ self.local_rank)
+ # Set random seed.
+ set_random_seed(self.model_config.seed)
+ # self.model = get_model(actor_model=self.model, model_config=self.model_config)
+
+ @torch.inference_mode()
+ def determine_num_available_blocks(self) -> Tuple[int, int]:
+ """Profiles the peak memory usage of the model to determine how many
+ KV blocks may be allocated without OOMs.
+
+ The engine will first conduct a profiling of the existing memory usage.
+ Then, it calculates the maximum possible number of GPU and CPU blocks
+ that can be allocated with the remaining free memory.
+
+ .. tip::
+ You may limit the usage of GPU memory
+ by adjusting the `gpu_memory_utilization` parameter.
+ """
+ # Profile the memory usage of the model and get the maximum number of
+ # cache blocks that can be allocated with the remaining free memory.
+ torch.cuda.empty_cache()
+ # torch.cuda.reset_peak_memory_stats()
+
+ # Execute a forward pass with dummy inputs to profile the memory usage
+ # of the model.
+ self.model_runner.profile_run()
+
+ # Calculate the number of blocks that can be allocated with the
+ # profiled peak memory.
+ torch.cuda.synchronize()
+ free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+ peak_memory = total_gpu_memory - free_gpu_memory
+
+ assert peak_memory > 0, ("Error in memory profiling. This happens when the GPU memory was "
+ "not properly cleaned up before initializing the vLLM instance.")
+
+ cache_block_size = self.get_cache_block_size_bytes()
+
+ # NOTE(sgm) use the remaining memory
+ num_gpu_blocks = int((free_gpu_memory * self.cache_config.gpu_memory_utilization) // cache_block_size)
+ # num_gpu_blocks = int((total_gpu_memory * self.cache_config.gpu_memory_utilization - peak_memory) // cache_block_size)
+
+ num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size)
+ num_gpu_blocks = max(num_gpu_blocks, 0)
+ num_cpu_blocks = max(num_cpu_blocks, 0)
+ if self.model_runner.lora_manager:
+ self.model_runner.remove_all_loras()
+
+ # NOTE(sgm): Added for verl; synchronize the block counts across all ranks in the TP group
+ num_gpu_blocks = torch.tensor([num_gpu_blocks], device='cuda')
+ num_cpu_blocks = torch.tensor([num_cpu_blocks], device='cuda')
+ torch.distributed.all_reduce(num_gpu_blocks,
+ op=torch.distributed.ReduceOp.MIN,
+ group=get_tensor_model_parallel_group())
+ torch.distributed.all_reduce(num_cpu_blocks,
+ op=torch.distributed.ReduceOp.MIN,
+ group=get_tensor_model_parallel_group())
+ num_gpu_blocks = num_gpu_blocks.item()
+ num_cpu_blocks = num_cpu_blocks.item()
+ gc.collect()
+ torch.cuda.empty_cache()
+ return num_gpu_blocks, num_cpu_blocks
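+
+ # Worked example of the block math above (illustrative numbers): with 40 GiB
+ # free after profiling, gpu_memory_utilization=0.5 and cache_block_size=2 MiB,
+ # num_gpu_blocks = int(40 GiB * 0.5 // 2 MiB) = 10240 blocks; the two
+ # all_reduce(MIN) calls then pin every TP rank to the smallest count in the group.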
+
+ def _init_cache_engine(self):
+ if self.cache_engine is None and self.gpu_cache is None:
+ super()._init_cache_engine()
+
+ def free_cache_engine(self):
+ # ensure `enforce_eager=True`
+ self.cache_engine = None
+ self.gpu_cache = None
+
+ @torch.inference_mode()
+ def execute_model(self, execute_model_req: Optional[ExecuteModelRequest] = None) -> List[SamplerOutput]:
+
+ if execute_model_req is None:
+ seq_group_metadata_list = None
+ else:
+ seq_group_metadata_list = execute_model_req.seq_group_metadata_list
+
+ # NOTE(sgm): each SPMD rank will have identical input
+ assert seq_group_metadata_list is not None
+ assert execute_model_req is not None
+ num_seq_groups = len(seq_group_metadata_list)
+ blocks_to_swap_in = execute_model_req.blocks_to_swap_in
+ blocks_to_swap_out = execute_model_req.blocks_to_swap_out
+ blocks_to_copy = execute_model_req.blocks_to_copy
+
+ self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy)
+
+ # If there is no input, we don't need to execute the model.
+ if num_seq_groups == 0:
+ return []
+
+ output = self.model_runner.execute_model(seq_group_metadata_list, self.gpu_cache)
+
+ # Worker only supports single-step execution. Wrap the output in a list
+ # to conform to interface.
+ return [output]
+
+ # assume the input is .state_dict()
+ def sync_model_weights(self, actor_weights: Dict, load_format: str):
+ if load_format in [LoadFormat.MEGATRON, LoadFormat.AUTO]:
+ load_megatron_weights(actor_weights, self.model_runner.model)
+ elif load_format == LoadFormat.HF:
+ # full model state dict without sharding
+ load_hf_weights(actor_weights, self.model_runner.model)
+ elif load_format == LoadFormat.DTENSOR:
+ load_dtensor_weights(actor_weights, self.model_runner.model)
+
+ def offload_model_weights(self) -> None:
+ if self.cpu_model is None:
+ self.cpu_model = {}
+ for name, params in self.model_runner.model.named_parameters():
+ self.cpu_model[name] = torch.empty_like(params, device='cpu')
+ params.data = self.cpu_model[name]
+ else:
+ for name, params in self.model_runner.model.named_parameters():
+ params.data = self.cpu_model[name]
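+
+ # NOTE (illustrative): after offload_model_weights(), every parameter's .data
+ # points at an *uninitialized* CPU buffer (torch.empty_like), so the GPU copies
+ # become eligible for release and the weights are effectively discarded; they
+ # must be restored via sync_model_weights(...) before the next generation.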
+
+
+def init_worker_distributed_environment(
+ parallel_config: ParallelConfig,
+ rank: int,
+ distributed_init_method: Optional[str] = "env://",
+ local_rank: int = -1,
+) -> None:
+ """Initialize the distributed environment."""
+ # NOTE(sgm): using tcp://localhost:xxxx will hang in the HF setting without Megatron
+ init_distributed_environment(parallel_config.world_size, rank, distributed_init_method, local_rank)
+
+ ensure_model_parallel_initialized(tensor_model_parallel_size=parallel_config.tensor_parallel_size,
+ pipeline_model_parallel_size=parallel_config.pipeline_parallel_size)
+
+ # TODO(sgm): check whether need this
+ # if pynccl_utils.is_initialized():
+ # pynccl_world_size = pynccl_utils.get_world_size()
+ # if pynccl_world_size != parallel_config.world_size:
+ # raise RuntimeError(
+ # "pynccl is already initialized but the pynccl world "
+ # "size does not match parallel_config.world_size "
+ # f"({pynccl_world_size} vs. {parallel_config.world_size}).")
+ # elif parallel_config.world_size > 1:
+ # # NOTE(woosuk): We don't initialize pynccl process group when world size
+ # # is 1.
+ # # NOTE(kaichao): By default, pynccl is initialized for tp group.
+ # pynccl_utils.init_process_group(
+ # group=get_tensor_model_parallel_cpu_group())
+
+ # # Initialize a custom fast all-reduce implementation.
+ # if not parallel_config.disable_custom_all_reduce:
+ # init_custom_ar()
+
+ # A small all_reduce for warmup.
+ torch.distributed.all_reduce(torch.zeros(1).cuda())
+ # if pynccl_utils.is_initialized():
+ # pynccl_utils.all_reduce(torch.zeros(1).cuda())
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/__init__.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/arg_utils.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/arg_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c577277b8621421cb5e1c3dbb713dcb34519215
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/arg_utils.py
@@ -0,0 +1,453 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py
+
+import os
+import argparse
+import dataclasses
+import json
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union
+
+import torch.nn as nn
+
+from transformers import PretrainedConfig
+from .config import ModelConfig, LoadConfig
+
+from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, EngineConfig, LoRAConfig, MultiModalConfig,
+ ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig, SpeculativeConfig,
+ TokenizerPoolConfig)
+from vllm.executor.executor_base import ExecutorBase
+from vllm.logger import init_logger
+from vllm.utils import FlexibleArgumentParser
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.utils import str_to_int_tuple
+
+if TYPE_CHECKING:
+ from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (BaseTokenizerGroup)
+
+logger = init_logger(__name__)
+
+
+def nullable_str(val: str):
+ """Treat empty strings and the literal string "None" as None (for CLI args)."""
+ if not val or val == "None":
+ return None
+ return val
+
+
+@dataclass
+class EngineArgs:
+ """Arguments for vLLM engine."""
+ model_hf_config: Optional[PretrainedConfig] = None # for verl
+ served_model_name = None # TODO(sgm): check this
+ # tokenizer: Optional[str] = None # TODO(sgm): check this
+ skip_tokenizer_init: bool = False
+ tokenizer_mode: str = 'auto'
+ trust_remote_code: bool = False
+ download_dir: Optional[str] = None
+ load_format: str = 'auto'
+ dtype: str = 'auto'
+ kv_cache_dtype: str = 'auto'
+ quantization_param_path: Optional[str] = None
+ seed: int = 0
+ max_model_len: Optional[int] = None
+ worker_use_ray: bool = False
+ # Note: Specifying a custom executor backend by passing a class
+ # is intended for expert use only. The API may change without
+ # notice.
+ distributed_executor_backend: Optional[Union[str, Type[ExecutorBase]]] = None
+ pipeline_parallel_size: int = 1
+ tensor_parallel_size: int = 1
+ max_parallel_loading_workers: Optional[int] = None
+ block_size: int = 16
+ enable_prefix_caching: bool = False
+ disable_sliding_window: bool = False
+ use_v2_block_manager: bool = False
+ swap_space: int = 4 # GiB
+ cpu_offload_gb: int = 0 # GiB
+ gpu_memory_utilization: float = 0.90
+ max_num_batched_tokens: Optional[int] = None
+ max_num_seqs: int = 256
+ max_logprobs: int = 20 # Default value for OpenAI Chat Completions API
+ disable_log_stats: bool = False
+ revision: Optional[str] = None
+ code_revision: Optional[str] = None
+ rope_scaling: Optional[dict] = None
+ rope_theta: Optional[float] = None
+ tokenizer_revision: Optional[str] = None
+ quantization: Optional[str] = None
+ enforce_eager: bool = False
+ max_context_len_to_capture: Optional[int] = None
+ max_seq_len_to_capture: int = 8192
+ disable_custom_all_reduce: bool = False
+ tokenizer_pool_size: int = 0
+ # Note: Specifying a tokenizer pool by passing a class
+ # is intended for expert use only. The API may change without
+ # notice.
+ tokenizer_pool_type: Union[str, Type["BaseTokenizerGroup"]] = "ray"
+ tokenizer_pool_extra_config: Optional[dict] = None
+ enable_lora: bool = False
+ max_loras: int = 1
+ max_lora_rank: int = 16
+ enable_prompt_adapter: bool = False
+ max_prompt_adapters: int = 1
+ max_prompt_adapter_token: int = 0
+ fully_sharded_loras: bool = False
+ lora_extra_vocab_size: int = 256
+ long_lora_scaling_factors: Optional[Tuple[float]] = None
+ lora_dtype: str = 'auto'
+ max_cpu_loras: Optional[int] = None
+ device: str = 'auto'
+ ray_workers_use_nsight: bool = False
+ num_gpu_blocks_override: Optional[int] = None
+ num_lookahead_slots: int = 0
+ model_loader_extra_config: Optional[dict] = None
+ ignore_patterns: Optional[Union[str, List[str]]] = None
+ preemption_mode: Optional[str] = None
+
+ scheduler_delay_factor: float = 0.0
+ enable_chunked_prefill: Optional[bool] = None
+
+ guided_decoding_backend: str = 'outlines'
+ # Speculative decoding configuration.
+ speculative_model: Optional[str] = None
+ speculative_draft_tensor_parallel_size: Optional[int] = None
+ num_speculative_tokens: Optional[int] = None
+ speculative_max_model_len: Optional[int] = None
+ speculative_disable_by_batch_size: Optional[int] = None
+ ngram_prompt_lookup_max: Optional[int] = None
+ ngram_prompt_lookup_min: Optional[int] = None
+ spec_decoding_acceptance_method: str = 'rejection_sampler'
+ typical_acceptance_sampler_posterior_threshold: Optional[float] = None
+ typical_acceptance_sampler_posterior_alpha: Optional[float] = None
+ qlora_adapter_name_or_path: Optional[str] = None
+ disable_logprobs_during_spec_decoding: Optional[bool] = None
+
+ otlp_traces_endpoint: Optional[str] = None
+
+ @staticmethod
+ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+ """Shared CLI arguments for vLLM engine."""
+ # Model arguments
+ # TODO(shengguangming): delete the unused args
+ parser.add_argument('--model',
+ type=str,
+ default='facebook/opt-125m',
+ help='name or path of the huggingface model to use')
+ parser.add_argument('--tokenizer',
+ type=str,
+ default=None, # NOTE: the tokenizer is supplied by verl, so there is no EngineArgs default
+ help='name or path of the huggingface tokenizer to use')
+ parser.add_argument('--revision',
+ type=str,
+ default=None,
+ help='the specific model version to use. It can be a branch '
+ 'name, a tag name, or a commit id. If unspecified, will use '
+ 'the default version.')
+ parser.add_argument('--tokenizer-revision',
+ type=str,
+ default=None,
+ help='the specific tokenizer version to use. It can be a branch '
+ 'name, a tag name, or a commit id. If unspecified, will use '
+ 'the default version.')
+ parser.add_argument('--tokenizer-mode',
+ type=str,
+ default=EngineArgs.tokenizer_mode,
+ choices=['auto', 'slow'],
+ help='tokenizer mode. "auto" will use the fast '
+ 'tokenizer if available, and "slow" will '
+ 'always use the slow tokenizer.')
+ parser.add_argument('--trust-remote-code', action='store_true', help='trust remote code from huggingface')
+ parser.add_argument('--download-dir',
+ type=str,
+ default=EngineArgs.download_dir,
+ help='directory to download and load the weights, '
+ 'defaults to the default cache dir of '
+ 'huggingface')
+ parser.add_argument('--load-format',
+ type=str,
+ default=EngineArgs.load_format,
+ choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
+ help='The format of the model weights to load. '
+ '"auto" will try to load the weights in the safetensors format '
+ 'and fall back to the pytorch bin format if safetensors format '
+ 'is not available. '
+ '"pt" will load the weights in the pytorch bin format. '
+ '"safetensors" will load the weights in the safetensors format. '
+ '"npcache" will load the weights in pytorch format and store '
+ 'a numpy cache to speed up the loading. '
+ '"dummy" will initialize the weights with random values, '
+ 'which is mainly for profiling.')
+ parser.add_argument('--dtype',
+ type=str,
+ default=EngineArgs.dtype,
+ choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+ help='data type for model weights and activations. '
+ 'The "auto" option will use FP16 precision '
+ 'for FP32 and FP16 models, and BF16 precision '
+ 'for BF16 models.')
+ parser.add_argument('--max-model-len',
+ type=int,
+ default=None,
+ help='model context length. If unspecified, '
+ 'will be automatically derived from the model.')
+ # Parallel arguments
+ parser.add_argument('--worker-use-ray',
+ action='store_true',
+ help='use Ray for distributed serving, will be '
+ 'automatically set when using more than 1 GPU')
+ parser.add_argument('--pipeline-parallel-size',
+ '-pp',
+ type=int,
+ default=EngineArgs.pipeline_parallel_size,
+ help='number of pipeline stages')
+ parser.add_argument('--tensor-parallel-size',
+ '-tp',
+ type=int,
+ default=EngineArgs.tensor_parallel_size,
+ help='number of tensor parallel replicas')
+ # KV cache arguments
+ parser.add_argument('--block-size',
+ type=int,
+ default=EngineArgs.block_size,
+ choices=[8, 16, 32],
+ help='token block size')
+ # TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
+ parser.add_argument('--seed', type=int, default=EngineArgs.seed, help='random seed')
+ parser.add_argument('--swap-space',
+ type=int,
+ default=EngineArgs.swap_space,
+ help='CPU swap space size (GiB) per GPU')
+ parser.add_argument('--gpu-memory-utilization',
+ type=float,
+ default=EngineArgs.gpu_memory_utilization,
+ help='the percentage of GPU memory to be used for '
+ 'the model executor')
+ parser.add_argument('--max-num-batched-tokens',
+ type=int,
+ default=EngineArgs.max_num_batched_tokens,
+ help='maximum number of batched tokens per '
+ 'iteration')
+ parser.add_argument('--max-num-seqs',
+ type=int,
+ default=EngineArgs.max_num_seqs,
+ help='maximum number of sequences per iteration')
+ parser.add_argument('--disable-log-stats', action='store_true', help='disable logging statistics')
+ # Quantization settings.
+ parser.add_argument('--quantization',
+ '-q',
+ type=str,
+ choices=['awq', None],
+ default=None,
+ help='Method used to quantize the weights')
+ return parser
+
+ @classmethod
+ def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
+ # Get the list of attributes of this dataclass.
+ attrs = [attr.name for attr in dataclasses.fields(cls)]
+ # Set the attributes from the parsed arguments.
+ engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
+ return engine_args
+
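+ # Illustrative usage of the two CLI helpers above (a sketch, not part of verl):
+ # parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
+ # engine_args = EngineArgs.from_cli_args(parser.parse_args())
+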
+ def create_engine_config(
+ self,
+ ) -> EngineConfig:
+ # bitsandbytes quantization needs a specific model loader
+ # so we make sure the quant method and the load format are consistent
+ if (self.quantization == "bitsandbytes" or
+ self.qlora_adapter_name_or_path is not None) and \
+ self.load_format != "bitsandbytes":
+ raise ValueError("BitsAndBytes quantization and QLoRA adapter only support "
+ f"'bitsandbytes' load format, but got {self.load_format}")
+
+ if (self.load_format == "bitsandbytes" or
+ self.qlora_adapter_name_or_path is not None) and \
+ self.quantization != "bitsandbytes":
+ raise ValueError("BitsAndBytes load format and QLoRA adapter only support "
+ f"'bitsandbytes' quantization, but got {self.quantization}")
+
+ assert self.cpu_offload_gb >= 0, ("CPU offload space must be non-negative"
+ f", but got {self.cpu_offload_gb}")
+
+ multimodal_config = MultiModalConfig()
+ device_config = DeviceConfig(self.device)
+ # NOTE(sgm): we only modify ModelConfig; the other configs are imported from vllm
+ model_config = ModelConfig(hf_config=self.model_hf_config,
+ tokenizer_mode=self.tokenizer_mode,
+ trust_remote_code=self.trust_remote_code,
+ dtype=self.dtype,
+ seed=self.seed,
+ revision=self.revision,
+ code_revision=self.code_revision,
+ rope_scaling=self.rope_scaling,
+ rope_theta=self.rope_theta,
+ tokenizer_revision=self.tokenizer_revision,
+ max_model_len=self.max_model_len,
+ quantization=self.quantization,
+ quantization_param_path=self.quantization_param_path,
+ enforce_eager=self.enforce_eager,
+ max_context_len_to_capture=self.max_context_len_to_capture,
+ max_seq_len_to_capture=self.max_seq_len_to_capture,
+ max_logprobs=self.max_logprobs,
+ disable_sliding_window=self.disable_sliding_window,
+ skip_tokenizer_init=self.skip_tokenizer_init,
+ served_model_name=self.served_model_name,
+ multimodal_config=multimodal_config)
+ cache_config = CacheConfig(
+ block_size=self.block_size,
+ gpu_memory_utilization=self.gpu_memory_utilization,
+ swap_space=self.swap_space,
+ cache_dtype=self.kv_cache_dtype,
+ num_gpu_blocks_override=self.num_gpu_blocks_override,
+ sliding_window=model_config.get_sliding_window(),
+ enable_prefix_caching=self.enable_prefix_caching,
+ cpu_offload_gb=self.cpu_offload_gb,
+ )
+ parallel_config = ParallelConfig(pipeline_parallel_size=self.pipeline_parallel_size,
+ tensor_parallel_size=self.tensor_parallel_size,
+ worker_use_ray=self.worker_use_ray,
+ max_parallel_loading_workers=self.max_parallel_loading_workers,
+ disable_custom_all_reduce=self.disable_custom_all_reduce,
+ tokenizer_pool_config=TokenizerPoolConfig.create_config(
+ self.tokenizer_pool_size,
+ self.tokenizer_pool_type,
+ self.tokenizer_pool_extra_config,
+ ),
+ ray_workers_use_nsight=self.ray_workers_use_nsight,
+ distributed_executor_backend=self.distributed_executor_backend)
+
+ # NOTE[VERL]: Use the world_size set by TORCHRUN
+ world_size = int(os.getenv("WORLD_SIZE", "-1"))
+ assert world_size != -1, "The world_size is set to -1, not initialized by TORCHRUN"
+ parallel_config.world_size = world_size
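+ # e.g. under `torchrun --nproc_per_node=8 ...` this picks up WORLD_SIZE=8,
+ # overriding the world size vLLM would otherwise derive from tp * pp.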
+
+ max_model_len = model_config.max_model_len
+ use_long_context = max_model_len > 32768
+ if self.enable_chunked_prefill is None:
+ # If not explicitly set, enable chunked prefill by default for
+ # long context (> 32K) models. This is to avoid OOM errors in the
+ # initial memory profiling phase.
+ if use_long_context:
+ is_gpu = device_config.device_type == "cuda"
+ use_sliding_window = (model_config.get_sliding_window() is not None)
+ use_spec_decode = self.speculative_model is not None
+ has_seqlen_agnostic_layers = (model_config.contains_seqlen_agnostic_layers(parallel_config))
+ if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora and
+ not self.enable_prompt_adapter and not self.enable_prefix_caching and
+ not has_seqlen_agnostic_layers):
+ self.enable_chunked_prefill = True
+ logger.warning("Chunked prefill is enabled by default for models with "
+ "max_model_len > 32K. Currently, chunked prefill might "
+ "not work with some features or models. If you "
+ "encounter any issues, please disable chunked prefill "
+ "by setting --enable-chunked-prefill=False.")
+ if self.enable_chunked_prefill is None:
+ self.enable_chunked_prefill = False
+
+ if not self.enable_chunked_prefill and use_long_context:
+ logger.warning(
+ "The model has a long context length (%s). This may cause OOM "
+ "errors during the initial memory profiling phase, or result "
+ "in low performance due to small KV cache space. Consider "
+ "setting --max-model-len to a smaller value.", max_model_len)
+
+ # TODO: spec config
+ speculative_config = SpeculativeConfig.maybe_create_spec_config(
+ target_model_config=model_config,
+ target_parallel_config=parallel_config,
+ target_dtype=self.dtype,
+ speculative_model=self.speculative_model,
+ speculative_draft_tensor_parallel_size=self.speculative_draft_tensor_parallel_size,
+ num_speculative_tokens=self.num_speculative_tokens,
+ speculative_disable_by_batch_size=self.speculative_disable_by_batch_size,
+ speculative_max_model_len=self.speculative_max_model_len,
+ enable_chunked_prefill=self.enable_chunked_prefill,
+ use_v2_block_manager=self.use_v2_block_manager,
+ disable_log_stats=self.disable_log_stats,
+ ngram_prompt_lookup_max=self.ngram_prompt_lookup_max,
+ ngram_prompt_lookup_min=self.ngram_prompt_lookup_min,
+ draft_token_acceptance_method=self.spec_decoding_acceptance_method,
+ typical_acceptance_sampler_posterior_threshold=self.typical_acceptance_sampler_posterior_threshold,
+ typical_acceptance_sampler_posterior_alpha=self.typical_acceptance_sampler_posterior_alpha,
+ disable_logprobs=self.disable_logprobs_during_spec_decoding,
+ )
+
+ scheduler_config = SchedulerConfig(
+ max_num_batched_tokens=self.max_num_batched_tokens,
+ max_num_seqs=self.max_num_seqs,
+ max_model_len=model_config.max_model_len,
+ use_v2_block_manager=self.use_v2_block_manager,
+ num_lookahead_slots=(self.num_lookahead_slots
+ if speculative_config is None else speculative_config.num_lookahead_slots),
+ delay_factor=self.scheduler_delay_factor,
+ enable_chunked_prefill=self.enable_chunked_prefill,
+ embedding_mode=model_config.embedding_mode,
+ preemption_mode=self.preemption_mode,
+ )
+ lora_config = LoRAConfig(max_lora_rank=self.max_lora_rank,
+ max_loras=self.max_loras,
+ fully_sharded_loras=self.fully_sharded_loras,
+ lora_extra_vocab_size=self.lora_extra_vocab_size,
+ long_lora_scaling_factors=self.long_lora_scaling_factors,
+ lora_dtype=self.lora_dtype,
+ max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras and self.max_cpu_loras > 0 else
+ None) if self.enable_lora else None
+
+ if self.qlora_adapter_name_or_path is not None and self.qlora_adapter_name_or_path != "":
+ if self.model_loader_extra_config is None:
+ self.model_loader_extra_config = {}
+ self.model_loader_extra_config["qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path
+
+ load_config = LoadConfig(
+ load_format=self.load_format,
+ download_dir=self.download_dir,
+ model_loader_extra_config=self.model_loader_extra_config,
+ ignore_patterns=self.ignore_patterns,
+ )
+
+ prompt_adapter_config = PromptAdapterConfig(
+ max_prompt_adapters=self.max_prompt_adapters,
+ max_prompt_adapter_token=self.max_prompt_adapter_token) if self.enable_prompt_adapter else None
+
+ decoding_config = DecodingConfig(guided_decoding_backend=self.guided_decoding_backend)
+
+ observability_config = ObservabilityConfig(otlp_traces_endpoint=self.otlp_traces_endpoint)
+
+ if (model_config.get_sliding_window() is not None and scheduler_config.chunked_prefill_enabled and
+ not scheduler_config.use_v2_block_manager):
+ raise ValueError("Chunked prefill is not supported with sliding window. "
+ "Set --disable-sliding-window to disable sliding window.")
+
+ return EngineConfig(
+ model_config=model_config,
+ cache_config=cache_config,
+ parallel_config=parallel_config,
+ scheduler_config=scheduler_config,
+ device_config=device_config,
+ lora_config=lora_config,
+ multimodal_config=multimodal_config,
+ speculative_config=speculative_config,
+ load_config=load_config,
+ decoding_config=decoding_config,
+ observability_config=observability_config,
+ prompt_adapter_config=prompt_adapter_config,
+ )
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/config.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fc61e6fe60661d7b5c4bfc77b5c1d3843997e46
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/config.py
@@ -0,0 +1,246 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py
+
+import enum
+import json
+from typing import List, Optional, Union
+from dataclasses import dataclass, field, fields
+
+import torch
+from transformers import PretrainedConfig
+
+from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization import get_quantization_config
+from vllm.transformers_utils.config import get_hf_text_config
+from vllm.utils import is_hip, print_warning_once
+# Add for verl
+from vllm.config import ModelConfig, MultiModalConfig, _get_and_verify_dtype, _get_and_verify_max_len, get_served_model_name
+
+GPTQMarlinConfig = get_quantization_config("gptq_marlin")
+
+logger = init_logger(__name__)
+
+_GB = 1 << 30
+
+
+class ModelConfig(ModelConfig):
+ """Configuration for the model.
+
+ Args:
+ model: Name or path of the huggingface model to use.
+ tokenizer: Name or path of the huggingface tokenizer to use.
+ tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
+ available, and "slow" will always use the slow tokenizer.
+ trust_remote_code: Trust remote code (e.g., from HuggingFace) when
+ downloading the model and tokenizer.
+ download_dir: Directory to download and load the weights, default to the
+ default cache directory of huggingface.
+ load_format: The format of the model weights to load:
+ "auto" will try to load the weights in the safetensors format and
+ fall back to the pytorch bin format if safetensors format is
+ not available.
+ "pt" will load the weights in the pytorch bin format.
+ "safetensors" will load the weights in the safetensors format.
+ "npcache" will load the weights in pytorch format and store
+ a numpy cache to speed up the loading.
+ "dummy" will initialize the weights with random values, which is
+ mainly for profiling.
+ dtype: Data type for model weights and activations. The "auto" option
+ will use FP16 precision for FP32 and FP16 models, and BF16 precision
+ for BF16 models.
+ seed: Random seed for reproducibility.
+ revision: The specific model version to use. It can be a branch name,
+ a tag name, or a commit id. If unspecified, will use the default
+ version.
+ code_revision: The specific revision to use for the model code on
+ Hugging Face Hub. It can be a branch name, a tag name, or a
+ commit id. If unspecified, will use the default version.
+ tokenizer_revision: The specific tokenizer version to use. It can be a
+ branch name, a tag name, or a commit id. If unspecified, will use
+ the default version.
+ max_model_len: Maximum length of a sequence (including prompt and
+ output). If None, will be derived from the model.
+ quantization: Quantization method that was used to quantize the model
+ weights. If None, we assume the model weights are not quantized.
+ quantization_param_path: Path to JSON file containing scaling factors.
+ Used to load KV cache scaling factors into the model when KV cache
+ type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also
+ be used to load activation and weight scaling factors when the
+ model dtype is FP8_E4M3 on ROCm.
+ enforce_eager: Whether to enforce eager execution. If True, we will
+ disable CUDA graph and always execute the model in eager mode.
+ If False, we will use CUDA graph and eager execution in hybrid.
+ max_context_len_to_capture: Maximum context len covered by CUDA graphs.
+ When a sequence has context length larger than this, we fall back
+ to eager mode (DEPRECATED. Use max_seq_len_to_capture instead).
+ max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
+ When a sequence has context length larger than this, we fall back
+ to eager mode
+ skip_tokenizer_init: If true, skip initialization of tokenizer and
+ detokenizer.
+ served_model_name: The model name used in metrics tag `model_name`,
+ matches the model name exposed via the APIs. If multiple model
+ names provided, the first name will be used. If not specified,
+ the model name will be the same as `model`.
+ """
+
+ def __init__(
+ self,
+ hf_config: PretrainedConfig,
+ tokenizer_mode: str,
+ trust_remote_code: bool,
+ dtype: Union[str, torch.dtype],
+ seed: int,
+ revision: Optional[str] = None,
+ code_revision: Optional[str] = None,
+ rope_scaling: Optional[dict] = None,
+ rope_theta: Optional[float] = None,
+ tokenizer_revision: Optional[str] = None,
+ max_model_len: Optional[int] = None,
+ quantization: Optional[str] = None,
+ quantization_param_path: Optional[str] = None,
+ enforce_eager: bool = False,
+ max_context_len_to_capture: Optional[int] = None,
+ max_seq_len_to_capture: Optional[int] = None,
+ max_logprobs: int = 20,
+ disable_sliding_window: bool = False,
+ skip_tokenizer_init: bool = False,
+ served_model_name: Optional[Union[str, List[str]]] = None,
+ multimodal_config: Optional["MultiModalConfig"] = None,
+ ) -> None:
+ self.model = hf_config._name_or_path
+ self.tokenizer = hf_config._name_or_path
+ # NOTE(sgm): same as the open-sourced version
+ self.tokenizer_mode = tokenizer_mode
+ self.trust_remote_code = trust_remote_code
+ self.seed = seed
+ self.revision = revision
+ self.code_revision = code_revision
+ self.rope_scaling = rope_scaling
+ self.rope_theta = rope_theta
+ # The tokenizer version is consistent with the model version by default.
+ if tokenizer_revision is None:
+ self.tokenizer_revision = revision
+ else:
+ self.tokenizer_revision = tokenizer_revision
+ self.quantization = quantization
+ self.quantization_param_path = quantization_param_path
+ self.enforce_eager = enforce_eager
+ if max_context_len_to_capture is not None:
+ raise ValueError("`max_context_len_to_capture` is deprecated. "
+ "Use `max_seq_len_to_capture` instead.")
+ self.max_seq_len_to_capture = max_seq_len_to_capture
+ self.max_logprobs = max_logprobs
+ self.disable_sliding_window = disable_sliding_window
+ self.skip_tokenizer_init = skip_tokenizer_init
+
+ # self.hf_config = get_config(model, trust_remote_code, revision)
+ self.hf_config = hf_config
+ self.hf_text_config = get_hf_text_config(hf_config)
+ self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
+ # self.served_model_name = get_served_model_name(model,
+ # served_model_name)
+ # self._verify_load_format()
+ # self._verify_tokenizer_mode()
+ if (not self.disable_sliding_window and self.hf_text_config.model_type == "gemma2" and
+ self.hf_text_config.sliding_window is not None):
+ print_warning_once("Gemma 2 uses sliding window attention for every odd layer, "
+ "which is currently not supported by vLLM. Disabling sliding "
+ "window and capping the max length to the sliding window size "
+ f"({self.hf_text_config.sliding_window}).")
+ self.disable_sliding_window = True
+
+ self.max_model_len = _get_and_verify_max_len(hf_config=self.hf_text_config,
+ max_model_len=max_model_len,
+ disable_sliding_window=self.disable_sliding_window,
+ sliding_window_len=self.get_hf_config_sliding_window())
+ self.served_model_name = get_served_model_name(
+ self.model, # str
+ served_model_name)
+ self.multimodal_config = multimodal_config
+
+ if not self.skip_tokenizer_init:
+ self._verify_tokenizer_mode()
+ self._verify_embedding_mode()
+ self._verify_quantization()
+ self._verify_cuda_graph()
+
+
+class LoadFormat(str, enum.Enum):
+ AUTO = 'auto'
+ MEGATRON = "megatron"
+ HF = "hf"
+ DTENSOR = 'dtensor'
+ DUMMY_HF = 'dummy_hf'
+ DUMMY_MEGATRON = 'dummy_megatron'
+ DUMMY_DTENSOR = 'dummy_dtensor'
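+
+ # NOTE: MEGATRON/AUTO, HF and DTENSOR map to load_megatron_weights,
+ # load_hf_weights and load_dtensor_weights respectively (see the worker's
+ # sync_model_weights); the DUMMY_* variants are assumed to initialize random
+ # weights for profiling before the first weight sync.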
+
+
+# TODO: check whether this is necessary
+@dataclass
+class LoadConfig:
+ """
+ download_dir: Directory to download and load the weights, default to the
+ default cache directory of huggingface.
+ load_format: The format of the model weights to load:
+ "auto" will try to load the weights in the safetensors format and
+ fall back to the pytorch bin format if safetensors format is
+ not available.
+ "pt" will load the weights in the pytorch bin format.
+ "safetensors" will load the weights in the safetensors format.
+ "npcache" will load the weights in pytorch format and store
+ a numpy cache to speed up the loading.
+ "dummy" will initialize the weights with random values, which is
+ mainly for profiling.
+ "tensorizer" will use CoreWeave's tensorizer library for
+ fast weight loading.
+ "bitsandbytes" will load nf4 type weights.
+ ignore_patterns: The list of patterns to ignore when loading the model.
+ Default to "original/**/*" to avoid repeated loading of llama's
+ checkpoints.
+
+ """
+
+ load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
+ download_dir: Optional[str] = None
+ model_loader_extra_config: Optional[Union[str, dict]] = field(default_factory=dict)
+ ignore_patterns: Optional[Union[List[str], str]] = None
+
+ def __post_init__(self):
+ model_loader_extra_config = self.model_loader_extra_config or {}
+ if isinstance(model_loader_extra_config, str):
+ self.model_loader_extra_config = json.loads(model_loader_extra_config)
+ self._verify_load_format()
+
+ if self.ignore_patterns is not None and len(self.ignore_patterns) > 0:
+ logger.info("Ignoring the following patterns when downloading weights: %s", self.ignore_patterns)
+ else:
+ self.ignore_patterns = ["original/**/*"]
+
+ def _verify_load_format(self) -> None:
+ if not isinstance(self.load_format, str):
+ return
+
+ load_format = self.load_format.lower()
+ self.load_format = LoadFormat(load_format)
+
+ rocm_not_supported_load_format: List[str] = []
+ if is_hip() and load_format in rocm_not_supported_load_format:
+ rocm_supported_load_format = [
+ f for f in LoadFormat.__members__ if (f not in rocm_not_supported_load_format)
+ ]
+ raise ValueError(f"load format '{load_format}' is not supported in ROCm. "
+ f"Supported load formats are "
+ f"{rocm_supported_load_format}")
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/dtensor_weight_loaders.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/dtensor_weight_loaders.py
new file mode 100644
index 0000000000000000000000000000000000000000..732b543db6347d2f22db22745a3a7c037636737e
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/dtensor_weight_loaders.py
@@ -0,0 +1,340 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
+
+from typing import Dict, Iterable, Tuple
+import torch
+import torch.nn as nn
+from torch.distributed._tensor import DTensor, Shard, Replicate
+
+from vllm.model_executor.layers.linear import *
+from vllm.model_executor.models import ModelRegistry
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.utils import is_pp_missing_parameter
+
+
+def gemma_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("qkv_proj", "q_proj", "q"),
+ ("qkv_proj", "k_proj", "k"),
+ ("qkv_proj", "v_proj", "v"),
+ ("gate_up_proj", "gate_proj", 0),
+ ("gate_up_proj", "up_proj", 1),
+ ]
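+ # e.g. (sketch): a checkpoint tensor "...self_attn.q_proj.weight" is loaded into
+ # the fused parameter "...self_attn.qkv_proj.weight" with shard_id "q".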
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+ for (param_name, shard_name, shard_id) in stacked_params_mapping:
+ if shard_name not in name:
+ continue
+ stacked_name = name.replace(shard_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if stacked_name.endswith(".bias") and stacked_name not in params_dict:
+ continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[stacked_name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
+ break
+ else:
+ # lm_head is not used in vllm as it is tied with embed_token.
+ # To prevent errors, skip loading lm_head.weight.
+ if "lm_head.weight" in name:
+ continue
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
+
+
+def gptbigcode_dtensor_load_weights(actor_weights: Dict, vllm_model: nn.Module):
+ params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
+ for name, loaded_weight in actor_weights.items():
+ if "lm_head.weight" in name:
+ continue
+ if ".attn.bias" in name:
+ # Skip attention mask.
+ # NOTE: "c_attn.bias" should not be skipped.
+ continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
+
+
+def starcoder2_dtensor_load_weights(actor_weights: Dict, vllm_model: nn.Module):
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("qkv_proj", "q_proj", "q"),
+ ("qkv_proj", "k_proj", "k"),
+ ("qkv_proj", "v_proj", "v"),
+ ]
+
+ params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
+ for name, loaded_weight in actor_weights.items():
+ if "rotary_emb.inv_freq" in name:
+ continue
+
+ for (param_name, weight_name, shard_id) in stacked_params_mapping:
+ if weight_name not in name:
+ continue
+ name = name.replace(weight_name, param_name)
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
+ break
+ else:
+ if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
+ continue
+ param = params_dict[name]
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
+
+
+def llama_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ (".qkv_proj", ".q_proj", "q"),
+ (".qkv_proj", ".k_proj", "k"),
+ (".qkv_proj", ".v_proj", "v"),
+ (".gate_up_proj", ".gate_proj", 0),
+ (".gate_up_proj", ".up_proj", 1),
+ ]
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+ if "rotary_emb.inv_freq" in name:
+ continue
+ if ("rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name):
+ # Models trained using ColossalAI may include these tensors in
+ # the checkpoint. Skip them.
+ continue
+ # With tie_word_embeddings, we can skip lm_head.weight
+ # The weight might appear unnecessarily in the files if the model is
+ # processed with quantization, LoRA, fine-tuning, etc.
+ if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
+ continue
+ for (param_name, weight_name, shard_id) in stacked_params_mapping:
+ if weight_name not in name:
+ continue
+ name = name.replace(weight_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
+ break
+ else:
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
+
+
+def qwen2_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("qkv_proj", "q_proj", "q"),
+ ("qkv_proj", "k_proj", "k"),
+ ("qkv_proj", "v_proj", "v"),
+ ("gate_up_proj", "gate_proj", 0),
+ ("gate_up_proj", "up_proj", 1),
+ ]
+ params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
+ for name, loaded_weight in actor_weights.items():
+ if "rotary_emb.inv_freq" in name:
+ continue
+ if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
+ continue
+ for (param_name, weight_name, shard_id) in stacked_params_mapping:
+ if weight_name not in name:
+ continue
+ name = name.replace(weight_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
+ break
+ else:
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ param = params_dict[name]
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
+
+
+from vllm.model_executor.layers.fused_moe import FusedMoE
+
+
+def deepseekv2_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("gate_up_proj", "gate_proj", 0),
+ ("gate_up_proj", "up_proj", 1),
+ ]
+
+ # Params for weights, fp8 weight scales, fp8 activation scales
+ # (param_name, weight_name, expert_id, shard_id)
+ expert_params_mapping = FusedMoE.make_expert_params_mapping(ckpt_gate_proj_name="gate_proj",
+ ckpt_down_proj_name="down_proj",
+ ckpt_up_proj_name="up_proj",
+ num_experts=vllm_model.config.n_routed_experts)
+
+ params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
+ for name, loaded_weight in actor_weights.items():
+ if "rotary_emb.inv_freq" in name:
+ continue
+ for (param_name, weight_name, shard_id) in stacked_params_mapping:
+ # Skip non-stacked layers and experts (experts handled below).
+ if weight_name not in name:
+ continue
+ # We have mlp.experts[0].gate_proj in the checkpoint.
+ # Since we handle the experts below in expert_params_mapping,
+ # we need to skip here BEFORE we update the name, otherwise
+ # name will be updated to mlp.experts[0].gate_up_proj, which
+ # will then be updated below in expert_params_mapping
+ # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+ if (("mlp.experts." in name) and name not in params_dict):
+ continue
+ name = name.replace(weight_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+
+ if is_pp_missing_parameter(name, vllm_model):
+ continue
+
+ param = params_dict[name]
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
+ break
+ else:
+ for mapping in expert_params_mapping:
+ param_name, weight_name, expert_id, shard_id = mapping
+ if weight_name not in name:
+ continue
+ name = name.replace(weight_name, param_name)
+
+ if is_pp_missing_parameter(name, vllm_model):
+ continue
+
+ param = params_dict[name]
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param,
+ local_loaded_weight.to(dtype=param.dtype),
+ weight_name,
+ shard_id=shard_id,
+ expert_id=expert_id)
+ break
+ else:
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+
+ if is_pp_missing_parameter(name, vllm_model):
+ continue
+
+ param = params_dict[name]
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
+
+
+def gpt2_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ # TODO: DTensor weight loading for GPT-2 is not implemented yet; this loader is a
+ # no-op, so GPT2LMHeadModel weights are currently not synced.
+ pass
+
+
+def redistribute_dtensor(param_name: str, loaded_weights: DTensor, parallelize_plan: Dict = None):
+ param_name = _process_parameter_names(name=param_name)
+ if parallelize_plan is not None:
+ assert param_name in parallelize_plan.keys(), \
+ f"param name: {param_name} not in parallelize_plan :{parallelize_plan.keys()}"
+ placement = parallelize_plan[param_name]
+ local_loaded_weights = loaded_weights.redistribute(device_mesh=loaded_weights.device_mesh,
+ placements=placement).to_local()
+ else:
+ local_loaded_weights = loaded_weights.full_tensor()
+ return local_loaded_weights
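+
+# Example (sketch): with a parallelize_plan such as
+# {"self_attn.qkv_proj": [Shard(0)]}
+# the DTensor is re-sharded along dim 0 before being converted to a local tensor;
+# without a plan it is simply gathered into a full replicated tensor.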
+
+
+def _process_parameter_names(name):
+ # Remove '.weight' if it exists at the end of the string
+ if name.endswith(".weight"):
+ name = name[:-7]
+
+ # Remove 'model.layers.x.' or 'model.' prefix
+ if "model.layers" in name:
+ parts = name.split('.')
+ # Reconstruct the string without 'model.layers.x.'
+ name = '.'.join(parts[3:]) # parts[0] is 'model', parts[1] is 'layers', parts[2] is 'x'
+ elif name.startswith("model."):
+ name = name[6:] # Remove 'model.'
+
+ return name
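+
+# For example: "model.layers.0.self_attn.q_proj.weight" -> "self_attn.q_proj",
+# and "model.embed_tokens.weight" -> "embed_tokens".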
+
+
+__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__ = {
+ 'GPT2LMHeadModel': gpt2_dtensor_weight_loader,
+ 'LlamaForCausalLM': llama_dtensor_weight_loader,
+ 'LLaMAForCausalLM': llama_dtensor_weight_loader,
+ 'MistralForCausalLM': llama_dtensor_weight_loader, # mistral is the same as llama in vLLM
+ 'InternLMForCausalLM': llama_dtensor_weight_loader,
+ 'AquilaModel': llama_dtensor_weight_loader,
+ 'AquilaForCausalLM': llama_dtensor_weight_loader,
+ 'Phi3ForCausalLM': llama_dtensor_weight_loader,
+ 'GemmaForCausalLM': gemma_dtensor_weight_loader,
+ 'Gemma2ForCausalLM': gemma_dtensor_weight_loader,
+ 'GPTBigCodeForCausalLM': gptbigcode_dtensor_load_weights,
+ 'Starcoder2ForCausalLM': starcoder2_dtensor_load_weights,
+ 'Qwen2ForCausalLM': qwen2_dtensor_weight_loader,
+ 'DeepseekV2ForCausalLM': deepseekv2_dtensor_weight_loader
+}
+
+
+# the actor model is .state_dict()
+# Load dtensor weights
+def load_dtensor_weights(actor_weights: Dict, vllm_model: nn.Module):
+ weight_loader = _get_model_weight_loader(vllm_model.__class__.__name__)
+ weight_loader(actor_weights, vllm_model)
+ # NOTE(sgm): to reduce peak memory usage, we offload the vllm model to cpu
+ # after init, so we need to move it back to cuda after syncing the model
+ # weights in the first iteration.
+ vllm_model = vllm_model.cuda()
+
+
+def _get_model_weight_loader(arch: str):
+ if arch in __MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__:
+ return __MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__[arch]
+ raise ValueError(f"Model architectures {arch} are not supported for now. "
+ f"Supported architectures: {__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__.keys()}")
+
+
+# NOTE(sgm): we use a per-parameter weight loader in each vllm submodule
+def update_dtensor_weight_loader():
+ pass
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/hf_weight_loader.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/hf_weight_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..7af4953f35e7107c9e7e6cd4f597b4a2715d441d
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/hf_weight_loader.py
@@ -0,0 +1,44 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
+
+from typing import Dict, Union, Optional, Iterable, Tuple
+
+import torch
+import torch.nn as nn
+
+from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+
+
+def update_hf_weight_loader():
+ print('no hf weight loader needs to be updated')
+ return
+
+
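+# NOTE: vllm_model.load_weights() consumes (name, tensor) pairs, so a plain HF
+# state_dict can be passed via .items() without any resharding; this is why the
+# HF load path requires the full, unsharded state dict.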
+def load_hf_weights(actor_weights: Dict, vllm_model: nn.Module):
+ assert isinstance(actor_weights, Dict)
+ with set_default_torch_dtype(next(vllm_model.parameters()).dtype): # TODO
+ if vllm_model.config.tie_word_embeddings and "lm_head.weight" in actor_weights.keys():
+ del actor_weights["lm_head.weight"]
+ vllm_model.load_weights(actor_weights.items())
+ for _, module in vllm_model.named_modules():
+ quant_method = getattr(module, "quant_method", None)
+ if quant_method is not None:
+ quant_method.process_weights_after_loading(module)
+ # FIXME: Remove this after Mixtral is updated
+ # to use quant_method.
+ if hasattr(module, "process_weights_after_loading"):
+ module.process_weights_after_loading()
+ vllm_model = vllm_model.cuda()
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/llm.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f56f1e07af01e646e3a096e8a3b931a43dc3747
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/llm.py
@@ -0,0 +1,239 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py
+
+from contextlib import contextmanager
+from typing import ClassVar, List, Optional, Sequence, Union, cast, overload, Dict, Tuple
+
+from tqdm import tqdm
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+from transformers import PretrainedConfig
+import torch.nn as nn
+from .arg_utils import EngineArgs
+from .llm_engine_sp import LLMEngine
+from vllm import LLM
+from vllm.inputs import (PromptInputs, TextPrompt, TokensPrompt, parse_and_batch_prompt)
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.model_executor.guided_decoding import (GuidedDecodingRequest, get_local_guided_decoding_logits_processor)
+from vllm.model_executor.guided_decoding.guided_fields import LLMGuidedOptions
+from vllm.outputs import EmbeddingRequestOutput, RequestOutput
+from vllm.pooling_params import PoolingParams
+from vllm.prompt_adapter.request import PromptAdapterRequest
+from vllm.sampling_params import SamplingParams
+from vllm.transformers_utils.tokenizer import get_cached_tokenizer
+from vllm.usage.usage_lib import UsageContext
+from vllm.utils import Counter, deprecate_kwargs
+import torch
+from torch.nn.utils.rnn import pad_sequence
+from verl.workers.rollout.tokenizer import HybridEngineBaseTokenizer
+
+
+class LLM(LLM):
+ """An LLM for generating texts from given prompts and sampling parameters.
+
+ This class includes a tokenizer, a language model (possibly distributed
+ across multiple GPUs), and GPU memory space allocated for intermediate
+ states (aka KV cache). Given a batch of prompts and sampling parameters,
+ this class generates texts from the model, using an intelligent batching
+ mechanism and efficient memory management.
+
+ NOTE: This class is intended to be used for offline inference. For online
+ serving, use the `AsyncLLMEngine` class instead.
+ NOTE: For the comprehensive list of arguments, see `EngineArgs`.
+
+ Args:
+ model: A HuggingFace Transformers model instance.
+ tokenizer: A HuggingFace Transformers tokenizer instance.
+ tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
+ if available, and "slow" will always use the slow tokenizer.
+ trust_remote_code: Trust remote code (e.g., from HuggingFace) when
+ downloading the model and tokenizer.
+ tensor_parallel_size: The number of GPUs to use for distributed
+ execution with tensor parallelism.
+ dtype: The data type for the model weights and activations. Currently,
+ we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
+ the `torch_dtype` attribute specified in the model config file.
+ However, if the `torch_dtype` in the config is `float32`, we will
+ use `float16` instead.
+ quantization: The method used to quantize the model weights. Currently,
+ we support "awq". If None, we assume the model weights are not
+ quantized and use `dtype` to determine the data type of the weights.
+ revision: The specific model version to use. It can be a branch name,
+ a tag name, or a commit id.
+ tokenizer_revision: The specific tokenizer version to use. It can be a
+ branch name, a tag name, or a commit id.
+ seed: The seed to initialize the random number generator for sampling.
+ gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
+ reserve for the model weights, activations, and KV cache. Higher
+ values will increase the KV cache size and thus improve the model's
+ throughput. However, if the value is too high, it may cause out-of-
+ memory (OOM) errors.
+ swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
+ This can be used for temporarily storing the states of the requests
+ when their `best_of` sampling parameters are larger than 1. If all
+ requests will have `best_of=1`, you can safely set this to 0.
+ Otherwise, too small values may cause out-of-memory (OOM) errors.
+ enforce_eager: Whether to enforce eager execution. If True, we will
+ disable CUDA graph and always execute the model in eager mode.
+ If False, we will use CUDA graph and eager execution in hybrid.
+ max_context_len_to_capture: Maximum context len covered by CUDA graphs.
+ When a sequence has context length larger than this, we fall back
+ to eager mode.
+ disable_custom_all_reduce: See ParallelConfig
+ """
+
+ def __init__(
+ self,
+ model: Union[nn.Module, Dict], # model itself or its parameter dict
+ tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer],
+ model_hf_config: PretrainedConfig,
+ tokenizer_mode: str = "auto",
+ trust_remote_code: bool = False,
+ skip_tokenizer_init: bool = False,
+ tensor_parallel_size: int = 1,
+ dtype: str = "auto",
+ quantization: Optional[str] = None,
+ revision: Optional[str] = None,
+ tokenizer_revision: Optional[str] = None,
+ seed: int = 0,
+ gpu_memory_utilization: float = 0.9,
+ swap_space: int = 4,
+ cpu_offload_gb: float = 0,
+ enforce_eager: bool = False,
+ max_context_len_to_capture: Optional[int] = None,
+ max_seq_len_to_capture: int = 8192,
+ disable_custom_all_reduce: bool = False,
+ load_format: str = 'auto',
+ **kwargs,
+ ) -> None:
+ if "disable_log_stats" not in kwargs:
+ kwargs["disable_log_stats"] = True
+ engine_args = EngineArgs(
+ model_hf_config=model_hf_config,
+ tensor_parallel_size=tensor_parallel_size,
+ dtype=dtype,
+ quantization=quantization,
+ revision=revision,
+ tokenizer_revision=tokenizer_revision,
+ seed=seed,
+ gpu_memory_utilization=gpu_memory_utilization,
+ swap_space=swap_space,
+ cpu_offload_gb=cpu_offload_gb,
+ enforce_eager=enforce_eager,
+ max_context_len_to_capture=max_context_len_to_capture,
+ max_seq_len_to_capture=max_seq_len_to_capture,
+ disable_custom_all_reduce=disable_custom_all_reduce,
+ load_format=load_format,
+ skip_tokenizer_init=skip_tokenizer_init,
+ **kwargs,
+ )
+ tokenizer_cls = (PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer)
+ if not isinstance(tokenizer, tokenizer_cls):
+ raise ValueError(
+ f"Unexpected tokenizer type: {type(tokenizer)}. Must be"
+ "one of the following: PreTrainedTokenizer, PreTrainedTokenizerFast, verl.workers.rollout.HybridEngineBaseTokenizer"
+ )
+ self.llm_engine = LLMEngine.from_engine_args(model, tokenizer, engine_args) # TODO: check usagecontext
+ self.request_counter = Counter()
+
+ def init_cache_engine(self):
+ self.llm_engine.init_cache_engine()
+
+ def free_cache_engine(self):
+ self.llm_engine.free_cache_engine()
+
+ def get_tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+ return self.llm_engine.tokenizer
+
+ def set_tokenizer(
+ self,
+ tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+ ) -> None:
+ self.llm_engine.tokenizer = tokenizer
+
+ def _run_engine(self, *, use_tqdm: bool) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
+ # Initialize tqdm.
+ if use_tqdm:
+ num_requests = self.llm_engine.get_num_unfinished_requests()
+ pbar = tqdm(
+ total=num_requests,
+ desc="Processed prompts",
+ dynamic_ncols=True,
+ postfix=(f"est. speed input: {0:.2f} toks/s, "
+ f"output: {0:.2f} toks/s"),
+ )
+ # Run the engine.
+ outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = []
+ total_in_toks = 0
+ total_out_toks = 0
+ while self.llm_engine.has_unfinished_requests():
+ step_outputs = self.llm_engine.step()
+ for output in step_outputs:
+ if output.finished:
+ outputs.append(output)
+ if use_tqdm:
+ if isinstance(output, RequestOutput):
+ # Calculate tokens only for RequestOutput
+ total_in_toks += len(output.prompt_token_ids)
+ in_spd = total_in_toks / pbar.format_dict["elapsed"]
+ total_out_toks += sum(len(stp.token_ids) for stp in output.outputs)
+ out_spd = total_out_toks / pbar.format_dict["elapsed"]
+ pbar.postfix = (f"est. speed input: {in_spd:.2f} toks/s, "
+ f"output: {out_spd:.2f} toks/s")
+ pbar.update(1)
+ if use_tqdm:
+ pbar.close()
+ # Sort the outputs by request ID.
+ # This is necessary because some requests may finish earlier than
+ # requests that were submitted before them.
+ outputs = sorted(outputs, key=lambda x: int(x.request_id))
+ return self._post_process_outputs(outputs)
+
+ # # NOTE(shengguangming): add for verl
+ # # TODO(sgm): we can optimize it by making the dataloader yield List[int] without padding.
+ # def _pre_process_inputs(self, prompt_token_ids: torch.Tensor) -> List[int]:
+ # # remove the left padding in the prompt token_id
+ # pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id is not None else self.llm_engine.tokenizer.eos_token_id
+ # non_pad_index = torch.nonzero(prompt_token_ids != pad_token_id, as_tuple=False)[0][0]
+ # token_ids = prompt_token_ids[non_pad_index:].tolist()
+ # return token_ids
+
+ # NOTE(shengguangming): add for verl
+ def _post_process_outputs(self, request_outputs: List[RequestOutput]) -> Tuple[torch.Tensor, torch.Tensor]:
+ output_token_ids = []
+ logprobs = []
+ for request_output in request_outputs: # List[RequestOutput]
+ outputs = request_output.outputs
+ for output in outputs: # List[CompletionOutput], usually len == 1
+ output_token_ids.append(torch.tensor(output.token_ids))
+ # TODO(shengguangming): can be optimized by rewriting Sampler._get_logprobs()
+ logprobs_dicts = output.logprobs
+ if logprobs_dicts is not None:
+ logprob = []
+ for logprobs_dict, token_id in zip(logprobs_dicts, output.token_ids):
+ logprob.append(logprobs_dict[token_id].logprob)
+ logprobs.append(torch.tensor(logprob))
+
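+ # Right-pad everything to the longest output. NOTE: the log-probs are padded
+ # with the same pad_token_id value as the token ids, so padded positions carry
+ # no meaning and are expected to be masked out by the caller.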
+ pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id is not None else self.llm_engine.tokenizer.eos_token_id
+ output_token_ids = pad_sequence(output_token_ids, batch_first=True, padding_value=pad_token_id)
+ if len(logprobs) > 0:
+ logprobs = pad_sequence(logprobs, batch_first=True, padding_value=pad_token_id)
+ return output_token_ids, logprobs
+
+ def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
+ self.llm_engine.sync_model_weights(actor_weights=actor_weights, load_format=load_format)
+
+ def offload_model_weights(self) -> None:
+ self.llm_engine.offload_model_weights()
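+
+
+# A minimal usage sketch of the hybrid-engine cycle (illustrative only; it assumes a
+# generate() entry point defined elsewhere on this class, and hypothetical variable names):
+#   llm.sync_model_weights(actor_weights, load_format="megatron")  # pull fresh actor weights
+#   llm.init_cache_engine()                                        # (re)allocate the KV cache
+#   outputs = llm.generate(prompts, sampling_params)               # rollout
+#   llm.free_cache_engine()                                        # release KV-cache memory
+#   llm.offload_model_weights()                                    # free weights until the next sync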
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/llm_engine_sp.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/llm_engine_sp.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d161e747066ece4b19984dac4aecfa32cecf6e5
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/llm_engine_sp.py
@@ -0,0 +1,328 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/llm_engine.py
+
+import torch
+from typing import Dict, Optional, Union, Type
+
+import vllm.envs as envs
+from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, EngineConfig, LoRAConfig, MultiModalConfig,
+ ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig, SpeculativeConfig)
+from vllm.core.scheduler import Scheduler
+from vllm.engine.output_processor.interfaces import (SequenceGroupOutputProcessor)
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.executor.executor_base import ExecutorBase
+from vllm.inputs import INPUT_REGISTRY, LLMInputs, PromptInputs
+from vllm.logger import init_logger
+from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.engine.metrics import (LoggingStatLogger, PrometheusStatLogger, StatLoggerBase, Stats)
+from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context, init_tracer)
+from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message)
+from vllm.utils import Counter
+from vllm.engine.llm_engine import _load_generation_config_dict
+from vllm.engine.llm_engine import LLMEngine
+from vllm.version import __version__ as VLLM_VERSION
+
+import torch.nn as nn
+from .arg_utils import EngineArgs
+from .tokenizer import TokenizerGroup
+from .config import ModelConfig, LoadConfig
+
+logger = init_logger(__name__)
+_LOCAL_LOGGING_INTERVAL_SEC = 5
+
+
+class LLMEngine(LLMEngine):
+ """An LLM engine that receives requests and generates texts.
+
+ This is the main class for the vLLM engine. It receives requests
+ from clients and generates texts from the LLM. It includes a tokenizer, a
+ language model (possibly distributed across multiple GPUs), and GPU memory
+ space allocated for intermediate states (aka KV cache). This class utilizes
+ iteration-level scheduling and efficient memory management to maximize the
+ serving throughput.
+
+ The `LLM` class wraps this class for offline batched inference and the
+ `AsyncLLMEngine` class wraps this class for online serving.
+
+ NOTE: The config arguments are derived from the `EngineArgs` class. For the
+ comprehensive list of arguments, see `EngineArgs`.
+
+ Args:
+        model: the actor model initialized outside vllm (added for verl)
+        tokenizer: the pre-initialized tokenizer (added for verl)
+ model_config: The configuration related to the LLM model.
+ cache_config: The configuration related to the KV cache memory
+ management.
+ parallel_config: The configuration related to distributed execution.
+ scheduler_config: The configuration related to the request scheduler.
+ distributed_init_method: The initialization method for distributed
+ execution. See `torch.distributed.init_process_group` for details.
+ placement_group: Ray placement group for distributed execution.
+ Required for distributed execution.
+ log_stats: Whether to log statistics.
+ """
+
+ def __init__(
+ self,
+ # NOTE(sgm): first two arguments are added for verl
+ model: Union[nn.Module, Dict], # model itself or its parameter dict
+ tokenizer: nn.Module,
+ # NOTE(sgm): vllm original arguments
+ model_config: ModelConfig,
+ cache_config: CacheConfig,
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ device_config: DeviceConfig,
+ load_config: LoadConfig,
+ lora_config: Optional[LoRAConfig],
+ multimodal_config: Optional[MultiModalConfig],
+ speculative_config: Optional[SpeculativeConfig],
+ decoding_config: Optional[DecodingConfig],
+ observability_config: Optional[ObservabilityConfig],
+ prompt_adapter_config: Optional[PromptAdapterConfig],
+ executor_class: Type[ExecutorBase],
+ log_stats: bool,
+ usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+ stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
+ ) -> None:
+ logger.info(
+ "Initializing an LLM engine (v%s) with config: "
+ "model=%r, speculative_config=%r, tokenizer=%r, "
+ "skip_tokenizer_init=%s, revision=%s, "
+ "rope_scaling=%r, rope_theta=%r, tokenizer_revision=%s, "
+ "trust_remote_code=%s, dtype=%s, max_seq_len=%d, "
+ "download_dir=%r, load_format=%s, tensor_parallel_size=%d, "
+ "pipeline_parallel_size=%d, "
+ "disable_custom_all_reduce=%s, quantization=%s, "
+ "enforce_eager=%s, kv_cache_dtype=%s, "
+ "quantization_param_path=%s, device_config=%s, "
+ "decoding_config=%r, observability_config=%r, "
+ "seed=%d, served_model_name=%s, use_v2_block_manager=%s, "
+ "enable_prefix_caching=%s)",
+ VLLM_VERSION,
+ model_config.model,
+ speculative_config,
+ model_config.tokenizer,
+ model_config.skip_tokenizer_init,
+ model_config.revision,
+ model_config.rope_scaling,
+ model_config.rope_theta,
+ model_config.tokenizer_revision,
+ model_config.trust_remote_code,
+ model_config.dtype,
+ model_config.max_model_len,
+ load_config.download_dir,
+ load_config.load_format,
+ parallel_config.tensor_parallel_size,
+ parallel_config.pipeline_parallel_size,
+ parallel_config.disable_custom_all_reduce,
+ model_config.quantization,
+ model_config.enforce_eager,
+ cache_config.cache_dtype,
+ model_config.quantization_param_path,
+ device_config.device,
+ decoding_config,
+ observability_config,
+ model_config.seed,
+ model_config.served_model_name,
+ scheduler_config.use_v2_block_manager,
+ cache_config.enable_prefix_caching,
+ )
+ # TODO(woosuk): Print more configs in debug mode.
+
+ self.model_config = model_config
+ self.cache_config = cache_config
+ self.lora_config = lora_config
+ self.multimodal_config = multimodal_config
+ self.parallel_config = parallel_config
+ self.scheduler_config = scheduler_config
+ self.device_config = device_config
+ self.speculative_config = speculative_config
+ self.load_config = load_config
+ self.decoding_config = decoding_config or DecodingConfig()
+ self.prompt_adapter_config = prompt_adapter_config
+ self.observability_config = observability_config or ObservabilityConfig()
+ self.log_stats = log_stats
+
+        # self.model = model  # we should not store the model here; it should be deleted
+        # TODO(shengguangming): maybe we can choose to init here or from the arguments
+ if not self.model_config.skip_tokenizer_init:
+ self.tokenizer = self._init_tokenizer(tokenizer)
+ self.detokenizer = Detokenizer(self.tokenizer)
+ else:
+ self.tokenizer = None
+ self.detokenizer = None
+
+ self.seq_counter = Counter()
+ self.generation_config_fields = _load_generation_config_dict(model_config)
+
+ self.input_processor = INPUT_REGISTRY.create_input_processor(self.model_config)
+
+ self.model_executor = executor_class(
+ model=model, # add for spmd_gpu_executor
+ model_config=model_config,
+ cache_config=cache_config,
+ parallel_config=parallel_config,
+ scheduler_config=scheduler_config,
+ device_config=device_config,
+ lora_config=lora_config,
+ multimodal_config=multimodal_config,
+ speculative_config=speculative_config,
+ load_config=load_config,
+ prompt_adapter_config=prompt_adapter_config,
+ )
+
+ # Profile the memory usage and initialize the cache.
+ if not self.model_config.embedding_mode:
+ self._initialize_kv_caches()
+
+ # If usage stat is enabled, collect relevant info.
+ if is_usage_stats_enabled():
+ from vllm.model_executor.model_loader import (get_architecture_class_name)
+ usage_message.report_usage(
+ get_architecture_class_name(model_config),
+ usage_context,
+ extra_kvs={
+ # Common configuration
+ "dtype": str(model_config.dtype),
+ "tensor_parallel_size": parallel_config.tensor_parallel_size,
+ "block_size": cache_config.block_size,
+ "gpu_memory_utilization": cache_config.gpu_memory_utilization,
+
+ # Quantization
+ "quantization": model_config.quantization,
+ "kv_cache_dtype": str(cache_config.cache_dtype),
+
+ # Feature flags
+ "enable_lora": bool(lora_config),
+ "enable_prompt_adapter": bool(prompt_adapter_config),
+ "enable_prefix_caching": cache_config.enable_prefix_caching,
+ "enforce_eager": model_config.enforce_eager,
+ "disable_custom_all_reduce": parallel_config.disable_custom_all_reduce,
+ })
+
+ if self.tokenizer:
+ # Ping the tokenizer to ensure liveness if it runs in a
+ # different process.
+ self.tokenizer.ping()
+
+ # Create the scheduler.
+        # NOTE: the cache_config here has been updated with the numbers of
+        # GPU and CPU blocks, which are profiled in the distributed executor.
+ self.scheduler = [
+ Scheduler(scheduler_config, cache_config, lora_config, parallel_config.pipeline_parallel_size)
+ for _ in range(parallel_config.pipeline_parallel_size)
+ ]
+
+ # Metric Logging.
+ if self.log_stats:
+ if stat_loggers is not None:
+ self.stat_loggers = stat_loggers
+ else:
+ self.stat_loggers = {
+ "logging":
+ LoggingStatLogger(local_interval=_LOCAL_LOGGING_INTERVAL_SEC),
+ "prometheus":
+ PrometheusStatLogger(local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
+ labels=dict(model_name=model_config.served_model_name),
+ max_model_len=self.model_config.max_model_len),
+ }
+ self.stat_loggers["prometheus"].info("cache_config", self.cache_config)
+
+ self.tracer = None
+ if self.observability_config.otlp_traces_endpoint:
+ self.tracer = init_tracer("vllm.llm_engine", self.observability_config.otlp_traces_endpoint)
+
+ # Create sequence output processor, e.g. for beam search or
+ # speculative decoding.
+ self.output_processor = (SequenceGroupOutputProcessor.create_output_processor(
+ self.scheduler_config,
+ self.detokenizer,
+ self.scheduler,
+ self.seq_counter,
+ self.get_tokenizer_for_seq,
+ stop_checker=StopChecker(
+ self.scheduler_config.max_model_len,
+ self.get_tokenizer_for_seq,
+ ),
+ ))
+
+    # TODO(sgm): added for verl, but we may not need a tokenizer in Rollout
+ def _init_tokenizer(self, tokenizer, **tokenizer_init_kwargs):
+ init_kwargs = dict(enable_lora=bool(self.lora_config),
+ max_num_seqs=self.scheduler_config.max_num_seqs,
+ max_input_length=None)
+ init_kwargs.update(tokenizer_init_kwargs)
+ return TokenizerGroup(tokenizer, **init_kwargs)
+
+ def init_cache_engine(self):
+        # TODO: check whether we should rebuild the CUDA graph every iteration when offloading/loading the KV cache;
+        # re-capturing the CUDA graph would be time-consuming.
+ self.model_executor.init_cache_engine()
+
+ def free_cache_engine(self):
+ self.model_executor.free_cache_engine()
+
+    # NOTE(sgm): currently, we only support the GPU executor;
+    # the GPUExecutor removes the Ray dependency.
+ @classmethod
+ def _get_executor_cls(cls, engine_config: EngineConfig) -> Type[ExecutorBase]:
+        assert engine_config.device_config.device_type == "cuda", \
+            "Currently, the vllm in verl only supports running on GPU"
+
+ if engine_config.parallel_config.world_size == 1:
+ engine_config.load_config.load_format = "dummy_hf"
+
+ from .spmd_gpu_executor import SPMDGPUExecutor
+ executor_class = SPMDGPUExecutor
+ return executor_class
+
+ @classmethod
+ def from_engine_args(
+ cls,
+ model,
+ tokenizer,
+ engine_args: EngineArgs,
+ usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+ stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
+ ) -> "LLMEngine":
+ """Creates an LLM engine from the engine arguments."""
+ # Create the engine configs.
+ engine_config = engine_args.create_engine_config()
+        # Initialize the cluster and specify the executor class.
+        # NOTE: _get_executor_cls() already enforces the CUDA-only constraint and
+        # returns SPMDGPUExecutor, so no further selection is needed here.
+        executor_class = cls._get_executor_cls(engine_config)
+
+ # Create the LLM engine.
+ engine = cls(
+ model,
+ tokenizer,
+ **engine_config.to_dict(),
+ executor_class=executor_class,
+ log_stats=not engine_args.disable_log_stats,
+ usage_context=usage_context,
+ stat_loggers=stat_loggers,
+ )
+ return engine
+
+ def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
+ self.model_executor.sync_model_weights(actor_weights=actor_weights, load_format=load_format)
+
+ def offload_model_weights(self) -> None:
+ self.model_executor.offload_model_weights()
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/megatron_weight_loaders.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/megatron_weight_loaders.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f2b19a904e77a9c2d10e259d061f797da67ddd8
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/megatron_weight_loaders.py
@@ -0,0 +1,307 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
+
+from typing import Dict
+import torch
+import torch.nn as nn
+
+from vllm.model_executor.layers.linear import *
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead
+from vllm.model_executor.layers.activation import ScaledActivation
+from vllm.model_executor.models import ModelRegistry
+
+
+# NOTE(shengguangming): replace the origin weight loader function in the class
+def parallel_weight_loader(self, param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+ """Parallel Linear weight loader."""
+    assert param.size() == loaded_weight.size(
+    ), 'the parameter size is not aligned with the loaded weight size, param size: {}, loaded_weight size: {}'.format(
+        param.size(), loaded_weight.size())
+    assert param.data.dtype == loaded_weight.data.dtype, "if we want to share weights, the data types must also match"
+
+ param.data = loaded_weight.data
+
+
+def default_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+ """Default weight loader."""
+ assert param.size() == loaded_weight.size()
+    assert param.data.dtype == loaded_weight.data.dtype, "if we want to share weights, the data types must also match"
+
+ param.data = loaded_weight.data
+
+
+def gpt2_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
+ for name, loaded_weight in actor_weights.items():
+ if "lm_head.weight" in name:
+ # GPT-2 ties the weights of the embedding layer and the final
+ # linear layer.
+ continue
+ if ".attn.bias" in name or ".attn.masked_bias" in name:
+ # Skip attention mask.
+ # NOTE: "c_attn.bias" should not be skipped.
+ continue
+ if not name.startswith("transformer."):
+ name = "transformer." + name
+ param = params_dict[name]
+ # The HF's GPT-2 implementation uses Conv1D instead of Linear.
+ # Because of this, we need to transpose the weights.
+ # Note(zhuohan): the logic below might break quantized models.
+ for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
+ if conv1d_weight_name not in name:
+ continue
+ if not name.endswith(".weight"):
+ continue
+ # TODO: check megatron
+ loaded_weight = loaded_weight.t()
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, loaded_weight)
+
+
+def llama_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ # NOTE(shengguangming): the megatron llama may have this prefix
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+        if "rotary_emb.inv_freq" in name:
+            continue
+        param = params_dict[name]
+        weight_loader = getattr(param, "weight_loader", default_weight_loader)
+        weight_loader(param, loaded_weight)
+
+
+def llama_megatron_core_te_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ params_mapping = [
+ # (megatron core gpt model name, vllm model name)
+ ("embedding.word_embeddings", "model.embed_tokens"),
+ ("self_attention.linear_qkv.layer_norm_weight", "input_layernorm.weight"),
+ ("self_attention.linear_qkv.layer_norm_bias", "input_layernorm.bias"),
+        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
+ ("self_attention.linear_proj", 'self_attn.o_proj'),
+ ('pre_mlp_layernorm', 'post_attention_layernorm'),
+ ('mlp.linear_fc1.layer_norm_weight', 'post_attention_layernorm.weight'),
+ ('mlp.linear_fc1.layer_norm_bias', 'post_attention_layernorm.bias'),
+ ('mlp.linear_fc1', 'mlp.gate_up_proj'),
+ ('mlp.linear_fc2', 'mlp.down_proj'),
+ ('decoder.final_layernorm', 'model.norm'),
+ ('output_layer', 'lm_head'),
+ ]
+ # NOTE(shengguangming): the megatron llama may have this prefix
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+ name = _replace_name(name, params_mapping)
+ if name.endswith('.bias') and name not in params_dict:
+ continue
+        if "rotary_emb.inv_freq" in name:
+            continue
+        param = params_dict[name]
+        weight_loader = getattr(param, "weight_loader", default_weight_loader)
+        weight_loader(param, loaded_weight)
+
+
+def llama_megatron_core_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ params_mapping = [
+ # (megatron core gpt model name, vllm model name)
+ ("embedding.word_embeddings", "model.embed_tokens"),
+ ("self_attention.linear_qkv", "self_attn.qkv_proj"),
+ ("self_attention.linear_proj", 'self_attn.o_proj'),
+ (
+ 'input_layernorm',
+ 'input_layernorm',
+ ),
+ ('pre_mlp_layernorm', 'post_attention_layernorm'),
+ ('mlp.linear_fc1', 'mlp.gate_up_proj'),
+ ('mlp.linear_fc2', 'mlp.down_proj'),
+ ('decoder.final_layernorm', 'model.norm'),
+ ('output_layer', 'lm_head'),
+ ]
+ # NOTE(shengguangming): the megatron llama may have this prefix
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+ name = _replace_name(name, params_mapping)
+ if name.endswith('.bias') and name not in params_dict:
+ continue
+        if "rotary_emb.inv_freq" in name:
+            continue
+        param = params_dict[name]
+        weight_loader = getattr(param, "weight_loader", default_weight_loader)
+        weight_loader(param, loaded_weight)
+
+
+def _replace_name(megatron_name, name_mapping):
+ for m_name, v_name in name_mapping:
+ if m_name not in megatron_name:
+ continue
+ if 'layers' in megatron_name: # deal with decoder layers
+ megatron_name = megatron_name.replace('decoder', 'model')
+ megatron_name_list = megatron_name.split('.')
+ if 'layer_norm_weight' in megatron_name_list or 'layer_norm_bias' in megatron_name_list:
+ param_name_list = megatron_name_list[:3]
+ param_name_list.append(v_name)
+ param_name = '.'.join(param_name_list)
+ else:
+ param_name_list = megatron_name_list[:3]
+ weight_or_bias = megatron_name_list[-1]
+ param_name_list.append(v_name)
+ param_name_list.append(weight_or_bias)
+ param_name = '.'.join(param_name_list)
+ return param_name
+ else:
+ param_name = megatron_name.replace(m_name, v_name)
+ return param_name
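+
+
+# Worked example of _replace_name (traced from the mapping logic above):
+#   "decoder.layers.0.self_attention.linear_proj.weight"
+#   -> "model.layers.0.self_attn.o_proj.weight"
+#   ('decoder' is rewritten to 'model'; 'self_attention.linear_proj' maps to 'self_attn.o_proj')
+
+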
+def mistral_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ # TODO: need to implement a general way to deal with prefix
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+        if "rotary_emb.inv_freq" in name:
+            continue
+        param = params_dict[name]
+        weight_loader = getattr(param, "weight_loader", default_weight_loader)
+        weight_loader(param, loaded_weight)
+
+
+__LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__ = {
+ ColumnParallelLinear: parallel_weight_loader,
+ MergedColumnParallelLinear: parallel_weight_loader,
+ QKVParallelLinear: parallel_weight_loader,
+ RowParallelLinear: parallel_weight_loader,
+ VocabParallelEmbedding: parallel_weight_loader,
+ ParallelLMHead: parallel_weight_loader
+ # "ScaledActivation.weight_loader": ScaledActivation, # TODO(shengguangming): latest commit in vllm fix awq for this function and add load_weights
+ # "default_weight_loader": default_weight_loader
+}
+
+# for layer_class, weight_loader in __LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__.items():
+# # setattr(layer_class, 'megatron_weight_loader', weight_loader)
+# layer_class.weight_loader = weight_loader
+
+__MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__ = {
+ 'GPT2LMHeadModel': gpt2_weight_loader,
+ 'LlamaForCausalLM': llama_megatron_weight_loader, # use te backend for open-source megatron
+ 'LLaMAForCausalLM': llama_megatron_weight_loader,
+ 'MistralForCausalLM': mistral_megatron_weight_loader,
+}
+
+
+# the actor model is .state_dict()
+# Load megatron weights
+def load_megatron_weights(actor_weights: Dict, vllm_model: nn.Module):
+ weight_loader = _get_model_weight_loader(vllm_model.__class__.__name__)
+ weight_loader(actor_weights, vllm_model)
+    # NOTE(sgm): to reduce peak memory usage, we offload the vllm model to CPU after init,
+    # and we need this cuda() call after syncing the model weights in the first iteration.
+ vllm_model = vllm_model.cuda()
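+
+# Usage sketch (illustrative; `megatron_actor` stands for a hypothetical Megatron-partitioned
+# training module):
+#   load_megatron_weights(megatron_actor.state_dict(), vllm_model)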
+
+
+def _get_model_weight_loader(arch: str):
+ if arch in __MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__:
+ return __MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__[arch]
+    raise ValueError(f"Model architecture {arch} is not supported for now. "
+                     f"Supported architectures: {ModelRegistry.get_supported_archs()}")
+
+
+def update_megatron_weight_loader():
+ for layer_class, weight_loader in __LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__.items():
+ layer_class.weight_loader = weight_loader
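+
+
+# NOTE: update_megatron_weight_loader() monkey-patches `weight_loader` on the vLLM layer
+# classes themselves, so it affects every parallel layer constructed afterwards in this
+# process. A sketch of the intended call order (inferred from get_model_loader() in
+# model_loader.py):
+#   update_megatron_weight_loader()                     # patch the layer classes first
+#   load_megatron_weights(actor_weights, vllm_model)    # then copy the actor weights in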
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/model_loader.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/model_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b675bb79df378b187d856136905104f7aca1146
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/model_loader.py
@@ -0,0 +1,302 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/model_loader
+
+from typing import Dict, Union, Optional, Iterable, Tuple
+
+import torch
+import torch.nn as nn
+from transformers import PreTrainedModel
+
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, MultiModalConfig,
+ ParallelConfig, SchedulerConfig)
+from vllm.model_executor.model_loader import BaseModelLoader
+from vllm.model_executor.model_loader.loader import _initialize_model
+from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+from vllm.distributed.communication_op import tensor_model_parallel_all_gather
+
+from .config import ModelConfig, LoadFormat, LoadConfig
+from .megatron_weight_loaders import load_megatron_weights, update_megatron_weight_loader
+from .dtensor_weight_loaders import load_dtensor_weights, update_dtensor_weight_loader
+from .hf_weight_loader import update_hf_weight_loader
+
+
+def get_model(actor_model: Union[PreTrainedModel, Dict],
+ model_config: ModelConfig,
+ load_config: LoadConfig,
+ device_config: DeviceConfig,
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ lora_config: Optional[LoRAConfig],
+ multimodal_config: Optional[MultiModalConfig],
+              cache_config: Optional[CacheConfig] = None) -> nn.Module:
+ loader = get_model_loader(load_config)
+ if load_config.load_format.startswith('dummy'):
+ return loader.load_model(model_config=model_config,
+ device_config=device_config,
+ lora_config=lora_config,
+ multimodal_config=multimodal_config,
+ parallel_config=parallel_config,
+ scheduler_config=scheduler_config,
+ cache_config=cache_config)
+ else:
+ return loader.load_model(actor_model=actor_model,
+ model_config=model_config,
+ device_config=device_config,
+ lora_config=lora_config,
+ multimodal_config=multimodal_config,
+ parallel_config=parallel_config,
+ scheduler_config=scheduler_config,
+ cache_config=cache_config)
+
+
+def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
+ """Get a model loader based on the load format."""
+
+ if isinstance(load_config.load_format, type):
+ return load_config.load_format(load_config)
+
+ if load_config.load_format == LoadFormat.AUTO:
+ update_megatron_weight_loader()
+ return MegatronLoader(load_config)
+
+    # NOTE(sgm): change the weight_loader function at runtime
+ if load_config.load_format == LoadFormat.MEGATRON:
+ update_megatron_weight_loader()
+ return MegatronLoader(load_config)
+
+ if load_config.load_format == LoadFormat.HF:
+ update_hf_weight_loader()
+ return HFLoader(load_config)
+
+ if load_config.load_format == LoadFormat.DTENSOR:
+ update_dtensor_weight_loader()
+ return DTensorLoader(load_config)
+
+ if load_config.load_format == LoadFormat.DUMMY_HF:
+ update_hf_weight_loader()
+ return DummyModelLoader(load_config)
+
+ if load_config.load_format == LoadFormat.DUMMY_MEGATRON:
+ update_megatron_weight_loader()
+ return DummyModelLoader(load_config)
+
+ if load_config.load_format == LoadFormat.DUMMY_DTENSOR:
+ update_dtensor_weight_loader()
+ return DummyModelLoader(load_config)
+
+    raise ValueError('load format not supported in verl: {}; only {} and {} are supported'.format(
+        load_config.load_format, LoadFormat.MEGATRON, LoadFormat.HF))
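+
+# Example: LoadConfig(load_format=LoadFormat.MEGATRON) applies the megatron weight-loader
+# patches and returns a MegatronLoader. The DUMMY_* formats build the model without loading
+# real weights; presumably those are filled in later via sync_model_weights().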
+
+
+class DummyModelLoader(BaseModelLoader):
+    """Model loader that builds the model without loading or randomizing weights
+    (the dummy-weight initialization below is commented out)."""
+
+ def __init__(self, load_config: LoadConfig):
+ super().__init__(load_config)
+ if load_config.model_loader_extra_config:
+ raise ValueError(f"Model loader extra config is not supported for "
+ f"load format {load_config.load_format}")
+
+ def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig],
+ multimodal_config: Optional[MultiModalConfig], parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module:
+ with set_default_torch_dtype(model_config.dtype):
+ with torch.device(device_config.device):
+ model = _initialize_model(model_config, self.load_config, lora_config, multimodal_config, cache_config,
+ scheduler_config)
+                # NOTE(woosuk): upstream vLLM assigns random values here for accurate
+                # performance evaluation; verl leaves this disabled (presumably because
+                # real weights are synced in afterwards via sync_model_weights).
+                # initialize_dummy_weights(model)
+ return model.eval()
+
+
+class MegatronLoader(BaseModelLoader):
+ """Model loader that can load the model weights from partitioned megatron model."""
+
+ def __init__(self, load_config: LoadConfig):
+ super().__init__(load_config)
+ if load_config.model_loader_extra_config:
+ raise ValueError(f"Model loader extra config is not supported for "
+ f"load format {load_config.load_format}")
+
+    def _get_weights_iterator(self, actor_model: Union[PreTrainedModel, Dict]):
+        # NOTE(shengguangming): load the weights from the actor model (currently unimplemented)
+ pass
+ # if isinstance(actor_model, nn.Module):
+ # load_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)), vllm_model=model)
+ # else:
+ # load_weights(actor_weights=actor_model, vllm_model=model)
+ # return actor_model
+
+ def load_model(self, actor_model: Union[PreTrainedModel, Dict], model_config: ModelConfig,
+ device_config: DeviceConfig, lora_config: Optional[LoRAConfig],
+ multimodal_config: Optional[MultiModalConfig], parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module:
+ with set_default_torch_dtype(model_config.dtype):
+ with torch.device(device_config.device):
+ model = _initialize_model(model_config, self.load_config, lora_config, multimodal_config, cache_config,
+ scheduler_config)
+
+ # TODO(sgm): This is a hack, we need to register the load_weight() func for each model in vllm
+ if isinstance(actor_model, nn.Module):
+ load_megatron_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)),
+ vllm_model=model)
+ else:
+ load_megatron_weights(actor_weights=actor_model, vllm_model=model)
+
+ for _, module in model.named_modules():
+ quant_method = getattr(module, "quant_method", None)
+ if quant_method is not None:
+ quant_method.process_weights_after_loading(module)
+ # FIXME: Remove this after Mixtral is updated
+ # to use quant_method.
+ if hasattr(module, "process_weights_after_loading"):
+ module.process_weights_after_loading()
+            # NOTE(sgm): some weights already point to GPU, but we still need this.
+            model = model.cuda()  # NOTE (zhangchi.usc1992): we need this for vllm to profile memory usage
+ return model.eval()
+
+
+class HFLoader(BaseModelLoader):
+ """Model loader that can load the model weights from model's full params."""
+
+ def __init__(self, load_config: LoadConfig):
+ super().__init__(load_config)
+ if load_config.model_loader_extra_config:
+ raise ValueError(f"Model loader extra config is not supported for "
+ f"load format {load_config.load_format}")
+
+ def _get_weights_iterator(self, actor_model: Union[PreTrainedModel, Dict]):
+        if isinstance(actor_model, dict):
+ return actor_model.items()
+ elif isinstance(actor_model, nn.Module):
+ return dict(actor_model.named_parameters()).items()
+ else:
+            raise ValueError(f'actor model should be a dict or nn.Module, but got {type(actor_model)}')
+
+ def load_model(self, actor_model: Union[PreTrainedModel, Dict], model_config: ModelConfig,
+ device_config: DeviceConfig, lora_config: Optional[LoRAConfig],
+ multimodal_config: Optional[MultiModalConfig], parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module:
+ with set_default_torch_dtype(model_config.dtype):
+ # with torch.device(device_config.device):
+ # NOTE(sgm): init the model in cpu
+ model = _initialize_model(model_config, self.load_config, lora_config, multimodal_config, cache_config,
+ scheduler_config)
+ model.load_weights(self._get_weights_iterator(actor_model))
+ for _, module in model.named_modules():
+ quant_method = getattr(module, "quant_method", None)
+ if quant_method is not None:
+ quant_method.process_weights_after_loading(module)
+ # FIXME: Remove this after Mixtral is updated
+ # to use quant_method.
+ if hasattr(module, "process_weights_after_loading"):
+ module.process_weights_after_loading()
+            # NOTE(sgm): some weights already point to GPU, but we still need this.
+            model = model.cuda()  # NOTE (zhangchi.usc1992): we need this for vllm to profile memory usage
+ return model.eval()
+
+
+class DTensorLoader(BaseModelLoader):
+ """Model loader that can load the model weights from partitioned megatron model."""
+
+ def __init__(self, load_config: LoadConfig):
+ super().__init__(load_config)
+ if load_config.model_loader_extra_config:
+ raise ValueError(f"Model loader extra config is not supported for "
+ f"load format {load_config.load_format}")
+
+    def _get_weights_iterator(self, actor_model: Union[PreTrainedModel, Dict]):
+        # NOTE(shengguangming): load the weights from the actor model (currently unimplemented)
+ pass
+ # if isinstance(actor_model, nn.Module):
+ # load_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)), vllm_model=model)
+ # else:
+ # load_weights(actor_weights=actor_model, vllm_model=model)
+ # return actor_model
+
+ def load_model(self, actor_model: Union[PreTrainedModel, Dict], model_config: ModelConfig,
+ device_config: DeviceConfig, lora_config: Optional[LoRAConfig],
+ multimodal_config: Optional[MultiModalConfig], parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module:
+ with set_default_torch_dtype(model_config.dtype):
+ with torch.device(device_config.device):
+ model = _initialize_model(model_config, self.load_config, lora_config, multimodal_config, cache_config,
+ scheduler_config)
+
+ # TODO(sgm): This is a hack, we need to register the load_weight() func for each model in vllm
+ if isinstance(actor_model, nn.Module):
+ load_dtensor_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)),
+ vllm_model=model)
+ else:
+ load_dtensor_weights(actor_weights=actor_model, vllm_model=model)
+
+ for _, module in model.named_modules():
+ quant_method = getattr(module, "quant_method", None)
+ if quant_method is not None:
+ quant_method.process_weights_after_loading(module)
+ # FIXME: Remove this after Mixtral is updated
+ # to use quant_method.
+ if hasattr(module, "process_weights_after_loading"):
+ module.process_weights_after_loading()
+            # NOTE(sgm): some weights already point to GPU, but we still need this.
+            model = model.cuda()  # NOTE (zhangchi.usc1992): we need this for vllm to profile memory usage
+ return model.eval()
+
+
+# FIXME(sgm): hack the _get_logits function in vllm v0.4.2
+# Because upstream vllm uses Ray, the _get_logits result only needs to be returned to the
+# driver node, so a plain gather is enough. However, since we use SPMD instead of a central
+# scheduler, an all_gather is required (aligned with v0.2.6).
+def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor,
+ embedding_bias: Optional[torch.Tensor]) -> torch.Tensor:
+ # Get the logits for the next tokens.
+ logits = torch.matmul(hidden_states, embedding.t())
+ if embedding_bias is not None:
+ logits += embedding_bias
+ logits = tensor_model_parallel_all_gather(logits)
+ # Remove paddings in vocab (if any).
+ if logits is not None:
+ logits = logits[:, :self.org_vocab_size]
+ return logits
+
+
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+
+
+def logitsprocessor_init(self,
+ vocab_size: int,
+ org_vocab_size: Optional[int] = None,
+ scale: float = 1.0,
+ logits_as_input: bool = False,
+ soft_cap: Optional[float] = None) -> None:
+ """
+ Args:
+ scale: A scaling factor to apply to the logits.
+ """
+ super(LogitsProcessor, self).__init__()
+ self.scale = scale
+ self.vocab_size = vocab_size
+ # Whether the input is logits (default is hidden states).
+ self.logits_as_input = logits_as_input
+ # original vocabulary size (without LoRA).
+ self.org_vocab_size = org_vocab_size or vocab_size
+ # Soft cap the logits. Used in Gemma 2.
+ self.soft_cap = soft_cap
+ # Whether to use gather or all-gather to gather the logits.
+ self.use_gather = False
+
+
+LogitsProcessor.__init__ = logitsprocessor_init # use all_gather
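+
+# NOTE: with use_gather=False, vLLM's LogitsProcessor collects logits with
+# tensor_model_parallel_all_gather instead of a driver-side gather, matching the SPMD
+# setup described above. The standalone _get_logits above spells out the same idea
+# (assumption: it is kept for reference rather than being patched in here).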
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/model_runner.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/model_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6ab232558a4bcdce568cae6e24f658c28628a4e
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/model_runner.py
@@ -0,0 +1,150 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/worker/model_runner.py
+
+import torch
+import torch.nn as nn
+from enum import IntEnum
+from typing import Dict, List, Optional, Set, Tuple, Union
+import warnings
+
+import vllm.envs as envs
+from vllm.attention import (AttentionMetadata, get_attn_backend)
+from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, MultiModalConfig, ParallelConfig, PromptAdapterConfig,
+ SchedulerConfig)
+from vllm.logger import init_logger
+from vllm.lora.layers import LoRAMapping
+from vllm.lora.request import LoRARequest
+from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
+from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.models.interfaces import (supports_lora, supports_vision)
+from vllm.utils import (CudaMemoryProfiler, is_hip, is_pin_memory_available)
+from vllm.worker.model_runner import ModelRunner, CUDAGraphRunner
+from vllm.prompt_adapter.worker_manager import (LRUCacheWorkerPromptAdapterManager)
+
+from .model_loader import get_model
+from .config import ModelConfig, LoadConfig
+
+logger = init_logger(__name__)
+
+
+# How batches are constructed.
+class BatchType(IntEnum):
+ # Every batch is prefill.
+ PREFILL = 0
+ # Every batch is decode.
+ DECODE = 1
+ # Batch is a mixture of prefill and decode.
+ MIXED = 2
+
+
+class ModelRunner(ModelRunner):
+
+ def __init__(
+ self,
+ model: Union[nn.Module, Dict], # [verl] model itself or its parameter dict
+ model_config: ModelConfig,
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ device_config: DeviceConfig,
+ cache_config: CacheConfig,
+ load_config: LoadConfig,
+ lora_config: Optional[LoRAConfig],
+ kv_cache_dtype: Optional[str] = "auto",
+ is_driver_worker: bool = False,
+ prompt_adapter_config: Optional[PromptAdapterConfig] = None,
+ multimodal_config: Optional[MultiModalConfig] = None,
+ return_hidden_states: bool = False,
+ ):
+
+ super().__init__(
+ model_config,
+ parallel_config,
+ scheduler_config,
+ device_config,
+ cache_config,
+ load_config,
+ lora_config,
+ kv_cache_dtype,
+ is_driver_worker=True, # a hack
+ prompt_adapter_config=prompt_adapter_config,
+ multimodal_config=multimodal_config,
+ return_hidden_states=return_hidden_states)
+
+ # NOTE(sgm): add for verl
+ self.model = model # this will be replaced by get_model()
+
+ # NOTE(sgm): initialize model using the actor model
+ def load_model(self) -> None:
+ logger.info("Starting to load model %s...", self.model_config.model)
+ with CudaMemoryProfiler() as m:
+ self.model = get_model(actor_model=self.model,
+ model_config=self.model_config,
+ device_config=self.device_config,
+ lora_config=self.lora_config,
+ load_config=self.load_config,
+ parallel_config=self.parallel_config,
+ scheduler_config=self.scheduler_config,
+ multimodal_config=self.multimodal_config,
+ cache_config=self.cache_config)
+ self.model_memory_usage = m.consumed_memory
+ logger.info("Loading model weights took %.4f GB", self.model_memory_usage / float(2**30))
+
+ if self.lora_config:
+ assert supports_lora(self.model), "Model does not support LoRA"
+ assert not supports_vision(self.model), "To be tested: vision language model with LoRA settings."
+
+ self.lora_manager = LRUCacheWorkerLoRAManager(
+ self.scheduler_config.max_num_seqs,
+ self.scheduler_config.max_num_batched_tokens,
+ self.vocab_size,
+ self.lora_config,
+ self.device,
+ self.model.embedding_modules,
+ self.model.embedding_padding_modules,
+ max_position_embeddings=self.model.config.max_position_embeddings,
+ )
+ self.model = self.lora_manager.create_lora_manager(self.model)
+
+ if self.prompt_adapter_config:
+ self.prompt_adapter_manager = LRUCacheWorkerPromptAdapterManager(
+ self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens, self.device,
+ self.prompt_adapter_config)
+ self.model = (self.prompt_adapter_manager.create_prompt_adapter_manager(self.model))
+
+ if self.kv_cache_dtype == "fp8" and is_hip():
+ # Currently only ROCm accepts kv-cache scaling factors
+ # via quantization_param_path and this will be deprecated
+ # in the future.
+ if self.model_config.quantization_param_path is not None:
+ if callable(getattr(self.model, "load_kv_cache_scales", None)):
+ warnings.warn(
+ "Loading kv cache scaling factor from JSON is "
+ "deprecated and will be removed. Please include "
+ "kv cache scaling factors in the model checkpoint.",
+ FutureWarning,
+ stacklevel=2)
+ self.model.load_kv_cache_scales(self.model_config.quantization_param_path)
+ logger.info("Loaded KV cache scaling factors from %s", self.model_config.quantization_param_path)
+ else:
+                    raise RuntimeError(
+                        "Using FP8 KV cache and scaling factors provided but "
+                        f"model {self.model.__class__} does not support loading scaling factors.")
+ else:
+ logger.warning("Using FP8 KV cache but no scaling factors "
+ "provided. Defaulting to scaling factors of 1.0. "
+ "This may lead to less accurate results!")
+
+ if envs.VLLM_TEST_DYNAMO_GRAPH_CAPTURE:
+ self.model = torch.compile(self.model, fullgraph=True, backend="eager")
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/parallel_state.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/parallel_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..0830093bca658fa4fdb4adc1d449b2dd678b73d5
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/parallel_state.py
@@ -0,0 +1,303 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Adapted from
+# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+"""Model and data parallel groups."""
+import os
+import torch
+import torch.distributed
+from typing import Optional
+
+import vllm.distributed.parallel_state as ps
+from vllm.distributed.parallel_state import get_pp_group, get_world_group, init_distributed_environment, init_model_parallel_group
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+
+from torch.distributed.device_mesh import init_device_mesh
+
+logger = init_logger(__name__)
+"""
+This version is strongly tied with Megatron to implement HybridEngine and weight sharing between vllm and Megatron.
+- We assume the Megatron tp+dp+pp world is already established before calling this function.
+
+"""
+
+# Device mesh for using DTensor
+_DEVICE_MESH = None
+
+# Tensor model parallel group that the current rank belongs to.
+_TP = None
+# Pipeline model parallel group that the current rank belongs to.
+_PP = None
+
+
+# This method is for initializing the ParallelGroup when using HybridEngine
+def initialize_parallel_state(
+ distributed_init_method: str = "env://",
+ backend: str = "nccl",
+ tensor_model_parallel_size: int = 1,
+ num_tp_per_train_tp: int = 1,
+ pipeline_model_parallel_size: int = 1,
+):
+ # torch.distributed.all_reduce does not free the input tensor until
+ # the synchronization point. This causes the memory usage to grow
+ # as the number of all_reduce calls increases. This env var disables
+ # this behavior.
+ # Related issue:
+ # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
+ os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+ # NOTE(sgm): Modify for verl, Env vars will be set by TORCHRUN.
+ rank = int(os.getenv("RANK", "-1"))
+ local_rank = int(os.getenv("LOCAL_RANK", "0"))
+
+ # Use the world_size set by TORCHRUN
+ world_size = int(os.getenv("WORLD_SIZE", "-1"))
+ assert world_size != -1, "The world_size is set to -1, not initialized by TORCHRUN"
+ init_distributed_environment(world_size, rank, distributed_init_method, local_rank, backend)
+ if torch.distributed.get_world_size() > 1:
+        # NOTE: build a separate inference group with infer tp & micro dp
+ initialize_model_parallel_for_vllm(tensor_model_parallel_size=tensor_model_parallel_size,
+ num_tensor_model_parallel_groups_per_train_tp=num_tp_per_train_tp)
+ else:
+ initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend)
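+
+# Usage sketch (illustrative): under torchrun, RANK / LOCAL_RANK / WORLD_SIZE are already
+# set in the environment, so a worker only needs something like:
+#   initialize_parallel_state(tensor_model_parallel_size=2)
+# to build an inference TP world of size 2 on top of the existing process group.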
+
+
+def ensure_model_parallel_initialized(
+ tensor_model_parallel_size: int,
+ pipeline_model_parallel_size: int = 1,
+ backend: Optional[str] = None,
+) -> None:
+ """Helper to initialize model parallel groups if they are not initialized,
+ or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
+ values if the model parallel groups are initialized.
+ """
+ # get the backend of _DEVICE_WORLD_GROUP
+ backend = backend or torch.distributed.get_backend(get_world_group().device_group)
+ if not model_parallel_is_initialized():
+ initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend)
+ return
+
+ assert (get_tensor_model_parallel_world_size() == tensor_model_parallel_size), (
+ "tensor parallel group already initialized, but of unexpected size: "
+ f"{get_tensor_model_parallel_world_size()=} vs. "
+ f"{tensor_model_parallel_size=}")
+ pp_world_size = get_pp_group().world_size
+ assert (pp_world_size == pipeline_model_parallel_size), (
+ "pipeline parallel group already initialized, but of unexpected size: "
+ f"{pp_world_size=} vs. "
+ f"{pipeline_model_parallel_size=}")
+
+
+# TODO(sgm): this deviates from v0.5.4; pp is not supported for now
+def model_parallel_is_initialized():
+ """Check if tensor and pipeline parallel groups are initialized."""
+ return (ps._TP is not None)
+ # and _PIPELINE_MODEL_PARALLEL_GROUP is not None)
+
+
+def initialize_model_parallel_for_vllm(tensor_model_parallel_size: int,
+ num_tensor_model_parallel_groups_per_train_tp: int = 1,
+ pipeline_model_parallel_size: int = 1) -> None:
+ from torch.distributed import new_group
+ # Get world size and rank. Ensure some consistencies.
+ assert torch.distributed.is_initialized()
+
+ assert isinstance(tensor_model_parallel_size, int)
+
+ # assert num_tensor_model_parallel_groups_per_train_tp == 1 and not different_tp_group
+ # assert num_tensor_model_parallel_groups_per_train_tp > 1 and different_tp_group
+
+ # Build the tensor model-parallel groups.
+ assert ps._TP is None, ("tensor model parallel group is already initialized")
+
+ global _TP
+
+ world_size: int = torch.distributed.get_world_size()
+
+ rank = torch.distributed.get_rank()
+
+ backend = torch.distributed.get_backend()
+
+ num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
+
+ if num_tensor_model_parallel_groups_per_train_tp == 1:
+ # if tensor_model_parallel_size == train_tensor_parallel_size:
+ # using the same tp group as Megatron/vllm
+ assert _TP is None, ("tensor model parallel group is already initialized")
+ group_ranks = []
+ for i in range(num_tensor_model_parallel_groups):
+ ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)
+ group_ranks.append(ranks)
+ _TP = init_model_parallel_group(
+ group_ranks=group_ranks,
+ local_rank=get_world_group().local_rank,
+ backend=backend,
+            use_custom_allreduce=False,  # TODO: check why True does not work in the Ray trainer
+ use_message_queue_broadcaster=True)
+ ps._TP = _TP
+        # _MICRO_DATA_PARALLEL_GROUP has been moved to the hybrid engine
+ else:
+        # initialize a micro_dp group and a tp group
+        # assume training tp=4 and infer tp=2; the weights are then partitioned as
+        # [1], [2], [3], [4] for training and [1,2], [1,2], [3,4], [3,4] for inference
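+        # A concrete trace of the loop below (illustrative): with world_size=4,
+        # tensor_model_parallel_size=2 and num_tensor_model_parallel_groups_per_train_tp=2
+        # (i.e. train_tp=4), it builds group_ranks = [[0, 2], [1, 3]].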
+
+ # Build the inference tp groups
+ # train_tp = train_tensor_parallel_size
+ train_tp = num_tensor_model_parallel_groups_per_train_tp * tensor_model_parallel_size
+ # num_tensor_model_parallel_groups_per_train_tp = train_tp // tensor_model_parallel_size
+ assert _TP is None, ("tensor model parallel group is already initialized")
+ group_ranks = []
+ for i in range(num_tensor_model_parallel_groups // num_tensor_model_parallel_groups_per_train_tp):
+ start = train_tp * i
+ end = train_tp * (i + 1)
+ for j in range(num_tensor_model_parallel_groups_per_train_tp):
+                # avoid shadowing the outer loop variable `i`
+                ranks = [base_rank + j for base_rank in range(start, end, num_tensor_model_parallel_groups_per_train_tp)]
+                group_ranks.append(ranks)
+ _TP = init_model_parallel_group(
+ group_ranks=group_ranks,
+ local_rank=get_world_group().local_rank,
+ backend=backend,
+            use_custom_allreduce=False,  # TODO: check why True does not work in the Ray trainer
+ use_message_queue_broadcaster=True)
+ ps._TP = _TP
+
+ # Build the pipeline model-parallel groups.
+ # global _PIPELINE_MODEL_PARALLEL_GROUP
+ # global _PIPELINE_GLOBAL_RANKS
+ # assert ps._PIPELINE_MODEL_PARALLEL_GROUP is None, ("pipeline model parallel group is already initialized")
+
+ # ps._PIPELINE_MODEL_PARALLEL_GROUP = mpu.get_pipeline_model_parallel_group()
+ # ps._PIPELINE_GLOBAL_RANKS = mpu.get_pipeline_model_parallel_ranks()
+
+ # TODO: init using device mesh (not support hybrid engine now)
+ # Build the pipeline model-parallel groups.
+ num_pipeline_model_parallel_groups: int = (world_size // pipeline_model_parallel_size)
+ global _PP
+ assert _PP is None, ("pipeline model parallel group is already initialized")
+ group_ranks = []
+ for i in range(num_pipeline_model_parallel_groups):
+ ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
+ group_ranks.append(ranks)
+ # pipeline parallel does not need custom allreduce
+ _PP = init_model_parallel_group(group_ranks, get_world_group().local_rank, backend, use_custom_allreduce=False)
+ ps._PP = _PP # for verl
+
+
+def initialize_model_parallel(
+ tensor_model_parallel_size: int = 1,
+ pipeline_model_parallel_size: int = 1,
+ backend: Optional[str] = None,
+) -> None:
+ """
+    NOTE: This method is a hack from the open-sourced version, without the
+    assertion that world_size == tp * pp.
+
+ Initialize model parallel groups.
+
+ Arguments:
+ tensor_model_parallel_size: number of GPUs used for tensor model
+ parallelism.
+ pipeline_model_parallel_size: number of GPUs used for pipeline model
+ parallelism.
+
+ Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
+ use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
+ the model pipeline. The present function will
+ create 4 tensor model-parallel groups and 2 pipeline model-parallel groups:
+ 4 tensor model-parallel groups:
+ [g0, g1], [g2, g3], [g4, g5], [g6, g7]
+ 2 pipeline model-parallel groups:
+ [g0, g2, g4, g6], [g1, g3, g5, g7]
+ Note that for efficiency, the caller should make sure adjacent ranks
+ are on the same DGX box. For example if we are using 2 DGX-1 boxes
+ with a total of 16 GPUs, rank 0 to 7 belong to the first box and
+ ranks 8 to 15 belong to the second box.
+ """
+ # Get world size and rank. Ensure some consistencies.
+ assert torch.distributed.is_initialized()
+ world_size: int = torch.distributed.get_world_size()
+ backend = backend or torch.distributed.get_backend(ps.get_world_group().device_group)
+
+ # NOTE(sgm) we don't assert world_size == tp * pp
+ # DP is not managed by vllm but by the veRL WorkerGroup
+ # if (world_size !=
+ # tensor_model_parallel_size * pipeline_model_parallel_size):
+ # raise RuntimeError(
+ # f"world_size ({world_size}) is not equal to "
+ # f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
+ # f"pipeline_model_parallel_size ({pipeline_model_parallel_size})")
+
+ num_tensor_model_parallel_groups: int = (world_size // tensor_model_parallel_size)
+ rank = torch.distributed.get_rank()
+ global _TP
+ assert _TP is None, ("tensor model parallel group is already initialized")
+ group_ranks = []
+ for i in range(num_tensor_model_parallel_groups):
+ ranks = list(range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size))
+ group_ranks.append(ranks)
+
+ # message queue broadcaster is only used in tensor model parallel group
+ _TP = init_model_parallel_group(
+ group_ranks,
+ get_world_group().local_rank,
+ backend,
+        use_custom_allreduce=False,  # TODO: check why True does not work in the Ray trainer
+ use_message_queue_broadcaster=True)
+ ps._TP = _TP
+
+ # TODO: init using device mesh (not support hybrid engine now)
+ # Build the pipeline model-parallel groups.
+ num_pipeline_model_parallel_groups: int = (world_size // pipeline_model_parallel_size)
+ global _PP
+ assert _PP is None, ("pipeline model parallel group is already initialized")
+ group_ranks = []
+ for i in range(num_pipeline_model_parallel_groups):
+ ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
+ group_ranks.append(ranks)
+ # pipeline parallel does not need custom allreduce
+ _PP = init_model_parallel_group(group_ranks, get_world_group().local_rank, backend, use_custom_allreduce=False)
+ ps._PP = _PP # for verl
+
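+# A minimal sketch (not part of vLLM's API) of the grouping logic above, for
+# the docstring example (world_size=8, tp=2, pp=4): TP groups are contiguous
+# blocks of `tp` ranks, while PP groups stride by the number of PP groups:
+#
+#     world_size, tp, pp = 8, 2, 4
+#     tp_groups = [list(range(i * tp, (i + 1) * tp)) for i in range(world_size // tp)]
+#     assert tp_groups == [[0, 1], [2, 3], [4, 5], [6, 7]]
+#     num_pp_groups = world_size // pp
+#     pp_groups = [list(range(i, world_size, num_pp_groups)) for i in range(num_pp_groups)]
+#     assert pp_groups == [[0, 2, 4, 6], [1, 3, 5, 7]]
+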
+
+"""
+Device mesh utilities
+"""
+
+
+def get_device_mesh():
+ assert _DEVICE_MESH is not None, ("device mesh is not initialized")
+ return _DEVICE_MESH
+
+
+"""
+Tensor model parallel utilities
+"""
+
+
+def get_tensor_model_parallel_group():
+ """Get the tensor model parallel group the caller rank belongs to."""
+ assert _TP is not None, ("tensor model parallel group is not initialized")
+ return _TP.device_group
+
+
+def get_tensor_model_parallel_world_size():
+ """Return world size for the tensor model parallel group."""
+ return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())
+
+
+def get_tensor_model_parallel_rank():
+ """Return my rank for the tensor model parallel group."""
+ return torch.distributed.get_rank(group=get_tensor_model_parallel_group())
+
+
+def get_tensor_model_parallel_src_rank():
+ """Calculate the global rank corresponding to the first local rank
+ in the tensor model parallel group."""
+ global_rank = torch.distributed.get_rank()
+ local_world_size = get_tensor_model_parallel_world_size()
+ return (global_rank // local_world_size) * local_world_size
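+
+
+# For example, with a tensor model parallel world size of 4, global ranks 4..7
+# form one TP group and all map to src rank 4 (e.g. (6 // 4) * 4 == 4).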
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/spmd_gpu_executor.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/spmd_gpu_executor.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9040d52b57558ced35cc37dcbb96014255ccf95
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/spmd_gpu_executor.py
@@ -0,0 +1,253 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/executor/gpu_executor.py
+
+import os
+import socket
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+import torch
+import vllm.envs as envs
+from vllm.executor.executor_base import ExecutorBase, ExecutorAsyncBase
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.sequence import SamplerOutput, ExecuteModelRequest
+
+from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, MultiModalConfig, ParallelConfig, PromptAdapterConfig,
+ SchedulerConfig, SpeculativeConfig)
+from .config import ModelConfig, LoadConfig
+
+logger = init_logger(__name__)
+
+
+class SPMDGPUExecutor(ExecutorBase):
+ """SPMD-based multi-GPU executor implementations."""
+
+ def __init__(
+ self,
+ model, # pytorch model itself or its parameter dict
+ model_config: ModelConfig,
+ cache_config: CacheConfig,
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ device_config: DeviceConfig,
+ load_config: LoadConfig,
+ lora_config: Optional[LoRAConfig],
+ multimodal_config: Optional[MultiModalConfig],
+ speculative_config: Optional[SpeculativeConfig],
+ prompt_adapter_config: Optional[PromptAdapterConfig],
+ ) -> None:
+ self.model_config = model_config
+ self.cache_config = cache_config
+ self.lora_config = lora_config
+ self.load_config = load_config
+ self.parallel_config = parallel_config
+ self.scheduler_config = scheduler_config
+ self.device_config = device_config
+ self.multimodal_config = multimodal_config
+ self.speculative_config = speculative_config
+ self.prompt_adapter_config = prompt_adapter_config
+
+ distributed_init_method = initialize_cluster(parallel_config)
+ self._init_executor(model, distributed_init_method)
+
+ # TODO(sgm): verl does not support speculative decoding yet
+ def _init_executor(self, model, distributed_init_method) -> None:
+ assert (not self.speculative_config), "Speculative decoding not yet supported for multi-GPU backend."
+
+ # Create the parallel worker for each GPU.
+ self._init_workers_sp(model, distributed_init_method)
+
+ def _init_workers_sp(self, model, distributed_init_method: str):
+ # Lazy import the Worker to avoid importing torch.cuda/xformers
+ # before CUDA_VISIBLE_DEVICES is set in the Worker
+ from .worker import Worker # pylint: disable=import-outside-toplevel
+
+ # RANK and LOCAL_RANK are expected to be set by torchrun
+ rank = int(os.getenv("RANK"))
+ local_rank = int(os.getenv("LOCAL_RANK"))
+ print(f'local rank {local_rank}')
+
+ # see https://github.com/NVIDIA/nccl/issues/1234
+ os.environ['NCCL_CUMEM_ENABLE'] = '0'
+
+ self.worker = Worker(
+ model,
+ self.model_config,
+ self.parallel_config,
+ self.scheduler_config,
+ self.device_config,
+ self.cache_config,
+ self.load_config,
+ local_rank,
+ rank,
+ distributed_init_method,
+ lora_config=self.lora_config,
+ multimodal_config=self.multimodal_config,
+ speculative_config=None,
+ prompt_adapter_config=self.prompt_adapter_config,
+ is_driver_worker=True,
+ model_runner_cls=None, # use the default one
+ )
+
+ # NOTE(shengguangming): torch.distributed.init_process_group will be called inside init_device()
+ self.worker.init_device()
+ self.worker.load_model()
+
+ def determine_num_available_blocks(self) -> Tuple[int, int]:
+ """Determine the number of available KV blocks.
+
+ This invokes `determine_num_available_blocks` on the local worker; the
+ worker reduces the result across ranks (taking the min), guaranteeing
+ that the selected cache sizes are compatible with all workers.
+
+ Returns:
+ - tuple[num_gpu_blocks, num_cpu_blocks]
+ """
+ # Get the maximum number of blocks that can be allocated on GPU and CPU.
+ num_blocks = self.worker.determine_num_available_blocks()
+
+ # NOTE(shengguangming): we don't use a shared centralized controller here;
+ # each process has its own scheduler
+ num_gpu_blocks = num_blocks[0]
+ num_cpu_blocks = num_blocks[1]
+
+ return num_gpu_blocks, num_cpu_blocks
+
+ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
+ """Initialize the KV cache in all workers."""
+
+ # NOTE: We log here to avoid multiple logs when number of workers is
+ # greater than one. We could log in the engine, but not all executors
+ # have GPUs.
+ logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, num_cpu_blocks)
+
+ self.cache_config.num_gpu_blocks = num_gpu_blocks
+ self.cache_config.num_cpu_blocks = num_cpu_blocks
+
+ if torch.distributed.get_rank() == 0:
+ print(
+ f'before init cache memory allocated: {torch.cuda.memory_allocated() / 1e9}GB, reserved: {torch.cuda.memory_reserved() / 1e9}GB'
+ )
+ self.worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks)
+ if torch.distributed.get_rank() == 0:
+ print(
+ f'after init cache memory allocated: {torch.cuda.memory_allocated() / 1e9}GB, reserved: {torch.cuda.memory_reserved() / 1e9}GB'
+ )
+
+ # NOTE(sgm): This will not profile & capture the model(CUDAGraph) when rebuilding KVCache
+ def init_cache_engine(self) -> None:
+ self.worker._init_cache_engine()
+
+ def free_cache_engine(self) -> None:
+ self.worker.free_cache_engine()
+
+ def execute_model(self, execute_model_req) -> List[SamplerOutput]:
+ all_outputs = self.worker.execute_model(execute_model_req=execute_model_req)
+
+ # NOTE(sgm):
+ # Each GPU in vllm under verl has its own spmd_gpu_executor, therefore all GPUs should return the outputs
+ # In vllm with ray, only the driver worker returns the sampling results.
+ return all_outputs
+
+ def add_lora(self, lora_request: LoRARequest) -> bool:
+ assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
+ return self.worker.add_lora(lora_request=lora_request)
+
+ def remove_lora(self, lora_id: int) -> bool:
+ assert lora_id > 0, "lora_id must be greater than 0."
+ return self.worker.remove_lora(lora_id=lora_id)
+
+ def list_loras(self) -> Set[int]:
+ return self.worker.list_loras()
+
+ def check_health(self) -> None:
+ # SPMDExecutor will always be healthy as long as
+ # it's running.
+ return
+
+ # NOTE(sgm) add for verl to pass the abstract class test, not used
+ from vllm.prompt_adapter.request import PromptAdapterRequest
+
+ def add_prompt_adapter(self, prompt_adapter_request: PromptAdapterRequest) -> bool:
+ assert prompt_adapter_request.prompt_adapter_id > 0, \
+ "prompt_adapter_id must be greater than 0."
+ return self.worker.add_prompt_adapter(prompt_adapter_request)
+
+ def list_prompt_adapters(self) -> Set[int]:
+ return self.worker.list_prompt_adapters()
+
+ def pin_lora(self, lora_id: int) -> bool:
+ assert lora_id > 0, "lora_id must be greater than 0."
+ return self.worker.pin_lora(lora_id)
+
+ def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+ assert prompt_adapter_id > 0, \
+ "prompt_adapter_id must be greater than 0."
+ return self.worker.pin_prompt_adapter(prompt_adapter_id)
+
+ def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+ assert prompt_adapter_id > 0, \
+ "prompt_adapter_id must be greater than 0."
+ return self.worker.remove_prompt_adapter(prompt_adapter_id)
+
+ # NOTE(sgm): add for verl
+ def offload_model_weights(self) -> None:
+ self.worker.offload_model_weights()
+
+ def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
+ self.worker.sync_model_weights(actor_weights=actor_weights, load_format=load_format)
+
+
+def initialize_cluster(
+ parallel_config: ParallelConfig,
+ engine_use_ray: bool = False,
+ ray_address: Optional[str] = None,
+) -> str:
+ """Set up the distributed initialization method for the cluster.
+
+ Args:
+ parallel_config: The configurations for parallel execution.
+
+ Returns:
+ The `distributed_init_method` address used to initialize the
+ distributed backend.
+ """
+
+ # Initialize cluster locally.
+ port = get_open_port()
+ # We need to setup the distributed init method to make sure
+ # the distributed megatron code (e.g., get world size) works correctly.
+ # distributed_init_method = f"tcp://localhost:{port}"
+ distributed_init_method = 'env://'
+ return distributed_init_method
+
+
+def get_open_port():
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+ s.bind(("", 0))
+ return s.getsockname()[1]
+
+
+# TODO(sgm): not implemented async executor yet
+class SPMDGPUExecutorAsync(SPMDGPUExecutor, ExecutorAsyncBase):
+
+ async def execute_model_async(self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+ """Executes one model step on the given sequences."""
+ raise NotImplementedError
+
+ async def check_health_async(self) -> None:
+ """Checks if the executor is healthy. If not, it should raise an
+ exception."""
+ self.check_health()
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/tokenizer.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa625a0338686d61816e838ef802cde327fc95c4
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/tokenizer.py
@@ -0,0 +1,77 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
+
+from typing import List, Optional, Tuple, Union
+
+from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast)
+
+from vllm.lora.request import LoRARequest
+from vllm.utils import make_async, LRUCache
+from vllm.transformers_utils.tokenizers import *
+
+
+class TokenizerGroup:
+ """A group of tokenizers that can be used for LoRA adapters."""
+
+ def __init__(self, tokenizer: PreTrainedTokenizer, enable_lora: bool, max_num_seqs: int,
+ max_input_length: Optional[int]):
+ self.enable_lora = enable_lora
+ self.max_input_length = max_input_length
+ self.tokenizer = tokenizer
+ self.lora_tokenizers = LRUCache[PreTrainedTokenizer](capacity=max_num_seqs) if enable_lora else None
+
+ def ping(self) -> bool:
+ """Check if the tokenizer group is alive."""
+ return True
+
+ def get_max_input_len(self, lora_request: Optional[LoRARequest] = None) -> Optional[int]:
+ """Get the maximum input length for the LoRA request."""
+ return self.max_input_length
+
+ def encode(self,
+ prompt: str,
+ request_id: Optional[str] = None,
+ lora_request: Optional[LoRARequest] = None) -> List[int]:
+ tokenizer = self.get_lora_tokenizer(lora_request)
+ return tokenizer.encode(prompt)
+
+ async def encode_async(self,
+ prompt: str,
+ request_id: Optional[str] = None,
+ lora_request: Optional[LoRARequest] = None) -> List[int]:
+ tokenizer = await self.get_lora_tokenizer_async(lora_request)
+ return tokenizer.encode(prompt)
+
+ def get_lora_tokenizer(self, lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer":
+ if not lora_request or not self.enable_lora:
+ return self.tokenizer
+ if lora_request.lora_int_id not in self.lora_tokenizers:
+ # TODO(sgm): the lora tokenizer is also passed, but may be different
+ tokenizer = self.tokenizer
+ # tokenizer = (get_lora_tokenizer(
+ # lora_request, **self.tokenizer_config) or self.tokenizer)
+ self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
+ return tokenizer
+ else:
+ return self.lora_tokenizers.get(lora_request.lora_int_id)
+
+ # FIXME(sgm): for simplicity, we assign the special token here
+ @property
+ def pad_token_id(self):
+ return self.tokenizer.pad_token_id
+
+ @property
+ def eos_token_id(self):
+ return self.tokenizer.eos_token_id
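+
+
+# A hedged usage sketch (tokenizer name illustrative): with LoRA disabled the
+# group always returns the base tokenizer, so `encode` is a thin wrapper:
+#
+#     from transformers import AutoTokenizer
+#     tok = AutoTokenizer.from_pretrained("gpt2")
+#     group = TokenizerGroup(tok, enable_lora=False, max_num_seqs=8, max_input_length=None)
+#     assert group.encode("hello world") == tok.encode("hello world")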
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/worker.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/worker.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5deb675a1180fc9a575ca0898be27f21c173151
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_5_4/worker.py
@@ -0,0 +1,323 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/worker/worker.py
+"""A GPU worker class."""
+import os
+import gc
+from typing import Dict, List, Tuple, Optional, Union, Type
+
+import torch
+import torch.distributed
+import torch.nn as nn
+
+from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, MultiModalConfig, ParallelConfig, PromptAdapterConfig,
+ SchedulerConfig, SpeculativeConfig)
+from vllm.model_executor import set_random_seed
+from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, SamplerOutput)
+from vllm.worker.cache_engine import CacheEngine
+# TODO(sgm): check why vllm has similar file in vllm.model_executor.parallel_utils.parallel_state
+from vllm.distributed import (init_distributed_environment, set_custom_all_reduce, get_tensor_model_parallel_group)
+from vllm.worker.worker_base import WorkerInput
+from vllm.worker.worker import Worker, _check_if_gpu_supports_dtype
+from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase
+from vllm.worker.embedding_model_runner import EmbeddingModelRunner
+from vllm.worker.model_runner import GPUModelRunnerBase
+from .model_runner import ModelRunner
+from .megatron_weight_loaders import load_megatron_weights
+from .hf_weight_loader import load_hf_weights
+from .dtensor_weight_loaders import load_dtensor_weights
+from .parallel_state import (ensure_model_parallel_initialized)
+from .config import ModelConfig, LoadConfig, LoadFormat
+
+
+class Worker(Worker):
+ """A worker class that executes (a partition of) the model on a GPU.
+
+ Each worker is associated with a single GPU. The worker is responsible for
+ maintaining the KV cache and executing the model on the GPU. In case of
+ distributed inference, each worker is assigned a partition of the model.
+ """
+
+ def __init__(
+ self,
+ model: Union[nn.Module, Dict], # model itself or its parameter dict
+ model_config: ModelConfig,
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ device_config: DeviceConfig,
+ cache_config: CacheConfig,
+ load_config: LoadConfig,
+ local_rank: int,
+ rank: int,
+ distributed_init_method: str,
+ lora_config: Optional[LoRAConfig] = None,
+ multimodal_config: Optional[MultiModalConfig] = None,
+ speculative_config: Optional[SpeculativeConfig] = None,
+ prompt_adapter_config: Optional[PromptAdapterConfig] = None,
+ is_driver_worker: bool = False,
+ model_runner_cls: Optional[Type[GPUModelRunnerBase]] = None,
+ ) -> None:
+ # self.model = model # will be replaced in the init_model
+ self.model_config = model_config
+ self.parallel_config = parallel_config
+ self.parallel_config.rank = rank
+ self.scheduler_config = scheduler_config
+ self.device_config = device_config
+ self.cache_config = cache_config
+ self.local_rank = local_rank
+ self.rank = rank
+ self.distributed_init_method = distributed_init_method
+ self.lora_config = lora_config
+ self.load_config = load_config
+ self.prompt_adapter_config = prompt_adapter_config
+ self.is_driver_worker = is_driver_worker # TODO: we don't need driver
+ # if parallel_config and is_driver_worker:
+ # assert rank % parallel_config.tensor_parallel_size == 0, \
+ # "Driver worker should be rank 0 of tensor parallel group."
+ if self.model_config.trust_remote_code:
+ # note: lazy import to avoid importing torch before initializing
+ from vllm.utils import init_cached_hf_modules
+ init_cached_hf_modules()
+ self.multimodal_config = multimodal_config
+
+ # Return hidden states from target model if the draft model is an
+ # mlp_speculator
+ speculative_args = {} if speculative_config is None \
+ or (speculative_config.draft_model_config.model ==
+ model_config.model) \
+ or (speculative_config.draft_model_config.hf_config.model_type
+ not in ["medusa", "mlp_speculator"]) \
+ else {"return_hidden_states": True}
+
+ # TODO(sgm): set correct model runner class
+ ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner
+ if model_runner_cls is not None:
+ ModelRunnerClass = model_runner_cls
+ elif self.model_config.embedding_mode:
+ ModelRunnerClass = EmbeddingModelRunner
+ self.model_runner: GPUModelRunnerBase = ModelRunnerClass(
+ model, # [VERL]: add for verl
+ model_config,
+ parallel_config,
+ scheduler_config,
+ device_config,
+ cache_config,
+ load_config=load_config,
+ lora_config=self.lora_config,
+ kv_cache_dtype=self.cache_config.cache_dtype,
+ is_driver_worker=is_driver_worker,
+ prompt_adapter_config=prompt_adapter_config,
+ multimodal_config=multimodal_config,
+ **speculative_args,
+ )
+
+ # Uninitialized cache engine. Will be initialized by
+ # initialize_cache.
+ self.cache_engine: Optional[List[CacheEngine]] = None
+ # Initialize gpu_cache as embedding models don't initialize kv_caches
+ self.gpu_cache: Optional[List[List[torch.Tensor]]] = None
+
+ # NOTE(sgm): [VERL] For offloading inference engine params
+ self.cpu_model = None
+
+ def init_device(self) -> None:
+ if self.device_config.device.type == "cuda":
+ # torch.distributed.all_reduce does not free the input tensor until
+ # the synchronization point. This causes the memory usage to grow
+ # as the number of all_reduce calls increases. This env var disables
+ # this behavior.
+ # Related issue:
+ # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
+ os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+ # NOTE(sgm): modified for verl; env vars will be set by torchrun
+ self.rank = self.rank if self.rank is not None else int(os.getenv("RANK", "-1"))
+ local_rank = int(os.getenv("LOCAL_RANK", "0"))
+ self.device = torch.device(f"cuda:{local_rank}")
+ if self.rank < 0:
+ raise ValueError("Invalid or unspecified rank.")
+ torch.cuda.set_device(self.device)
+
+ # Use the world_size set by TORCHRUN
+ world_size = int(os.getenv("WORLD_SIZE", "-1"))
+ assert world_size != -1, "WORLD_SIZE is -1; it was not initialized by torchrun"
+ self.parallel_config.world_size = world_size
+
+ _check_if_gpu_supports_dtype(self.model_config.dtype)
+ torch.cuda.empty_cache()
+ self.init_gpu_memory = torch.cuda.mem_get_info()[0]
+ else:
+ raise RuntimeError(f"Unsupported device type: {self.device_config.device}")
+
+ # Initialize the distributed environment.
+ init_worker_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method,
+ self.local_rank)
+ # Set random seed.
+ set_random_seed(self.model_config.seed)
+ # self.model = get_model(actor_model=self.model, model_config=self.model_config)
+
+ @torch.inference_mode()
+ def determine_num_available_blocks(self) -> Tuple[int, int]:
+ """Profiles the peak memory usage of the model to determine how many
+ KV blocks may be allocated without OOMs.
+
+ The engine will first profile the existing memory usage.
+ Then, it calculates the maximum possible number of GPU and CPU blocks
+ that can be allocated with the remaining free memory.
+
+ .. tip::
+ You may limit the usage of GPU memory
+ by adjusting the `gpu_memory_utilization` parameter.
+ """
+ # Profile the memory usage of the model and get the maximum number of
+ # cache blocks that can be allocated with the remaining free memory.
+ torch.cuda.empty_cache()
+ # torch.cuda.reset_peak_memory_stats()
+
+ # Execute a forward pass with dummy inputs to profile the memory usage
+ # of the model.
+ self.model_runner.profile_run()
+
+ # Calculate the number of blocks that can be allocated with the
+ # profiled peak memory.
+ torch.cuda.synchronize()
+ free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+ peak_memory = total_gpu_memory - free_gpu_memory
+
+ assert peak_memory > 0, ("Error in memory profiling. This happens when the GPU memory was "
+ "not properly cleaned up before initializing the vLLM instance.")
+
+ cache_block_size = self.get_cache_block_size_bytes()
+
+ # NOTE(sgm) [VERL] use the remaining memory
+ num_gpu_blocks = int((free_gpu_memory * self.cache_config.gpu_memory_utilization) // cache_block_size)
+ # num_gpu_blocks = int((total_gpu_memory * self.cache_config.gpu_memory_utilization - peak_memory) // cache_block_size)
+
+ num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size)
+ num_gpu_blocks = max(num_gpu_blocks, 0)
+ num_cpu_blocks = max(num_cpu_blocks, 0)
+ if self.model_runner.lora_manager:
+ self.model_runner.remove_all_loras()
+
+ # NOTE(sgm): Add for [VERL]: synchronize the number of blocks across all ranks
+ num_gpu_blocks = torch.tensor([num_gpu_blocks], device='cuda')
+ num_cpu_blocks = torch.tensor([num_cpu_blocks], device='cuda')
+
+ torch.distributed.all_reduce(num_gpu_blocks,
+ op=torch.distributed.ReduceOp.MIN,
+ group=get_tensor_model_parallel_group().device_group)
+ torch.distributed.all_reduce(num_cpu_blocks,
+ op=torch.distributed.ReduceOp.MIN,
+ group=get_tensor_model_parallel_group().device_group)
+ num_gpu_blocks = num_gpu_blocks.item()
+ num_cpu_blocks = num_cpu_blocks.item()
+ gc.collect()
+ torch.cuda.empty_cache()
+ return num_gpu_blocks, num_cpu_blocks
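+
+ # A back-of-the-envelope sketch of the block arithmetic above (illustrative
+ # numbers, not measurements): with 40 GiB free, gpu_memory_utilization=0.5
+ # and a cache block size of 2 MiB, num_gpu_blocks ~= 40 * 1024 * 0.5 / 2 = 10240.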
+
+ def _init_cache_engine(self):
+ if self.cache_engine is None and self.gpu_cache is None:
+ super()._init_cache_engine()
+
+ def free_cache_engine(self):
+ # ensure `enforce_eager=True`
+ self.cache_engine = None
+ self.gpu_cache = None
+
+ # NOTE(sgm): [VERL]: adapt from _execute_model_spmd()
+ def execute_model(self,
+ execute_model_req: ExecuteModelRequest,
+ intermediate_tensors: Optional[IntermediateTensors] = None) -> Optional[List[SamplerOutput]]:
+ """
+ Execute model in Single Program Multiple Data (SPMD) fashion.
+ All workers take the same request, prepare the input and
+ execute the model.
+ """
+ assert execute_model_req is not None, ("execute_model() in SPMD mode requires each worker to take in an "
+ "ExecuteModelRequest")
+ worker_input: WorkerInput = self.prepare_worker_input(execute_model_req=execute_model_req)
+ model_input: ModelRunnerInputBase = (self.model_runner.prepare_model_input(
+ execute_model_req.seq_group_metadata_list))
+
+ # verl.worker.workerbase.WorkerBase
+ # swap cache
+ super().execute_worker(worker_input)
+
+ # If there is no input, we don't need to execute the model.
+ if worker_input.num_seq_groups == 0:
+ return []
+
+ return self.model_runner.execute_model(
+ model_input, self.kv_cache[worker_input.virtual_engine] if self.kv_cache is not None else None,
+ intermediate_tensors)
+
+ # assume the input is .state_dict()
+ def sync_model_weights(self, actor_weights: Dict, load_format: str):
+ if load_format in [LoadFormat.MEGATRON, LoadFormat.AUTO]:
+ load_megatron_weights(actor_weights, self.model_runner.model)
+ elif load_format == LoadFormat.HF:
+ # full model state dict without sharding
+ load_hf_weights(actor_weights, self.model_runner.model)
+ elif load_format == LoadFormat.DTENSOR:
+ load_dtensor_weights(actor_weights, self.model_runner.model)
+
+ def offload_model_weights(self) -> None:
+ # NOTE: the CPU buffers are allocated with `empty_like` and never filled,
+ # so offloading discards the current GPU weights; they are expected to be
+ # restored via `sync_model_weights` before the next generation.
+ if self.cpu_model is None:
+ self.cpu_model = {}
+ for name, params in self.model_runner.model.named_parameters():
+ self.cpu_model[name] = torch.empty_like(params, device='cpu')
+ params.data = self.cpu_model[name]
+ else:
+ for name, params in self.model_runner.model.named_parameters():
+ params.data = self.cpu_model[name]
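+
+# Hedged sketch of the verl weight-sync cycle using the methods above (the
+# surrounding driver objects are illustrative, not a fixed API):
+#
+#     worker.sync_model_weights(actor_state_dict, load_format=LoadFormat.DTENSOR)
+#     outputs = worker.execute_model(execute_model_req)
+#     worker.offload_model_weights()  # drop GPU weights until the next sync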
+
+
+def init_worker_distributed_environment(
+ parallel_config: ParallelConfig,
+ rank: int,
+ distributed_init_method: Optional[str] = "env://",
+ local_rank: int = -1,
+) -> None:
+ """Initialize the distributed environment."""
+ set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
+
+ # NOTE(sgm): using tcp://localhost:xxxx hangs in the HF setting without Megatron
+ init_distributed_environment(parallel_config.world_size, rank, distributed_init_method, local_rank)
+
+ ensure_model_parallel_initialized(tensor_model_parallel_size=parallel_config.tensor_parallel_size,
+ pipeline_model_parallel_size=parallel_config.pipeline_parallel_size)
+
+ # TODO(sgm): check whether need this
+ # if pynccl_utils.is_initialized():
+ # pynccl_world_size = pynccl_utils.get_world_size()
+ # if pynccl_world_size != parallel_config.world_size:
+ # raise RuntimeError(
+ # "pynccl is already initialized but the pynccl world "
+ # "size does not match parallel_config.world_size "
+ # f"({pynccl_world_size} vs. {parallel_config.world_size}).")
+ # elif parallel_config.world_size > 1:
+ # # NOTE(woosuk): We don't initialize pynccl process group when world size
+ # # is 1.
+ # # NOTE(kaichao): By default, pynccl is initialized for tp group.
+ # pynccl_utils.init_process_group(
+ # group=get_tensor_model_parallel_cpu_group())
+
+ # # Initialize a custom fast all-reduce implementation.
+ # if not parallel_config.disable_custom_all_reduce:
+ # init_custom_ar()
+
+ # A small all_reduce for warmup.
+ torch.distributed.all_reduce(torch.zeros(1).cuda())
+ # if pynccl_utils.is_initialized():
+ # pynccl_utils.all_reduce(torch.zeros(1).cuda())
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/__init__.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/arg_utils.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/arg_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc4685c5f7968e827491f2bc02fdbd59bfac220c
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/arg_utils.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py
+
+import os
+from dataclasses import dataclass
+
+from transformers import PretrainedConfig
+from vllm.config import EngineConfig
+from vllm.engine.arg_utils import EngineArgs
+
+from .config import LoadConfig, ModelConfig
+
+
+@dataclass
+class EngineArgs(EngineArgs):
+ model_hf_config: PretrainedConfig = None # for verl
+
+ def __post_init__(self):
+ pass
+
+ def create_model_config(self) -> ModelConfig:
+ return ModelConfig(
+ hf_config=self.model_hf_config,
+ tokenizer_mode=self.tokenizer_mode,
+ trust_remote_code=self.trust_remote_code,
+ dtype=self.dtype,
+ seed=self.seed,
+ revision=self.revision,
+ code_revision=self.code_revision,
+ rope_scaling=self.rope_scaling,
+ rope_theta=self.rope_theta,
+ tokenizer_revision=self.tokenizer_revision,
+ max_model_len=self.max_model_len,
+ quantization=self.quantization,
+ quantization_param_path=self.quantization_param_path,
+ enforce_eager=self.enforce_eager,
+ max_context_len_to_capture=self.max_context_len_to_capture,
+ max_seq_len_to_capture=self.max_seq_len_to_capture,
+ max_logprobs=self.max_logprobs,
+ disable_sliding_window=self.disable_sliding_window,
+ skip_tokenizer_init=self.skip_tokenizer_init,
+ served_model_name=self.served_model_name,
+ limit_mm_per_prompt=self.limit_mm_per_prompt,
+ use_async_output_proc=not self.disable_async_output_proc,
+ override_neuron_config=self.override_neuron_config,
+ config_format=self.config_format,
+ mm_processor_kwargs=self.mm_processor_kwargs,
+ )
+
+ def create_load_config(self) -> LoadConfig:
+ return LoadConfig(
+ load_format=self.load_format,
+ download_dir=self.download_dir,
+ model_loader_extra_config=self.model_loader_extra_config,
+ ignore_patterns=self.ignore_patterns,
+ )
+
+ def create_engine_config(self) -> EngineConfig:
+ engine_config = super().create_engine_config()
+
+ # NOTE[VERL]: Use the world_size set by torchrun
+ world_size = int(os.getenv("WORLD_SIZE", "-1"))
+ assert world_size != -1, "WORLD_SIZE is -1; it was not initialized by torchrun"
+ engine_config.parallel_config.world_size = world_size
+
+ return engine_config
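+
+
+# Hedged usage sketch (hf_config is illustrative): under torchrun, WORLD_SIZE
+# is taken from the environment rather than derived from tp * pp:
+#
+#     # torchrun --nproc_per_node=4 rollout.py
+#     args = EngineArgs(model_hf_config=hf_config, tensor_parallel_size=4)
+#     config = args.create_engine_config()
+#     assert config.parallel_config.world_size == 4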
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/config.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7cee451416eb1d7d6c9b4b83fc53dc25a336ccf
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/config.py
@@ -0,0 +1,105 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py
+
+import enum
+import json
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, List, Optional, Union
+
+from transformers import PretrainedConfig
+
+# Add for verl
+from vllm.config import ModelConfig
+from vllm.logger import init_logger
+from vllm.utils import is_hip
+
+if TYPE_CHECKING:
+ from vllm.model_executor.model_loader.loader import BaseModelLoader
+
+logger = init_logger(__name__)
+
+
+class LoadFormat(str, enum.Enum):
+ AUTO = "auto"
+ MEGATRON = "megatron"
+ HF = "hf"
+ DTENSOR = "dtensor"
+ DUMMY_HF = "dummy_hf"
+ DUMMY_MEGATRON = "dummy_megatron"
+ DUMMY_DTENSOR = "dummy_dtensor"
+
+
+class ModelConfig(ModelConfig):
+
+ def __init__(self, hf_config: PretrainedConfig, *args, **kwargs) -> None:
+ super().__init__(model=hf_config._name_or_path, tokenizer=hf_config._name_or_path, *args, **kwargs)
+ self.hf_config = hf_config
+
+
+@dataclass
+class LoadConfig:
+ """
+ download_dir: Directory to download and load the weights, default to the
+ default cache directory of huggingface.
+ load_format: The format of the model weights to load:
+ "auto" will try to load the weights in the safetensors format and
+ fall back to the pytorch bin format if safetensors format is
+ not available.
+ "pt" will load the weights in the pytorch bin format.
+ "safetensors" will load the weights in the safetensors format.
+ "npcache" will load the weights in pytorch format and store
+ a numpy cache to speed up the loading.
+ "dummy" will initialize the weights with random values, which is
+ mainly for profiling.
+ "tensorizer" will use CoreWeave's tensorizer library for
+ fast weight loading.
+ "bitsandbytes" will load nf4 type weights.
+ ignore_patterns: The list of patterns to ignore when loading the model.
+ Default to "original/**/*" to avoid repeated loading of llama's
+ checkpoints.
+
+ """
+
+ load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
+ download_dir: Optional[str] = None
+ model_loader_extra_config: Optional[Union[str, dict]] = field(default_factory=dict)
+ ignore_patterns: Optional[Union[List[str], str]] = None
+
+ def __post_init__(self):
+ model_loader_extra_config = self.model_loader_extra_config or {}
+ if isinstance(model_loader_extra_config, str):
+ self.model_loader_extra_config = json.loads(model_loader_extra_config)
+ self._verify_load_format()
+
+ if self.ignore_patterns is not None and len(self.ignore_patterns) > 0:
+ logger.info("Ignoring the following patterns when downloading weights: %s", self.ignore_patterns)
+ else:
+ self.ignore_patterns = ["original/**/*"]
+
+ def _verify_load_format(self) -> None:
+ if not isinstance(self.load_format, str):
+ return
+
+ load_format = self.load_format.lower()
+ self.load_format = LoadFormat(load_format)
+
+ rocm_not_supported_load_format: List[str] = []
+ if is_hip() and load_format in rocm_not_supported_load_format:
+ rocm_supported_load_format = [
+ f for f in LoadFormat.__members__ if (f not in rocm_not_supported_load_format)
+ ]
+ raise ValueError(f"load format '{load_format}' is not supported in ROCm. "
+ f"Supported load formats are "
+ f"{rocm_supported_load_format}")
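+
+
+# A small sanity sketch: string load formats are normalized to the enum in
+# __post_init__, e.g.
+#
+#     cfg = LoadConfig(load_format="megatron")
+#     assert cfg.load_format is LoadFormat.MEGATRON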
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/dtensor_weight_loaders.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/dtensor_weight_loaders.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3042cabcc4112472b4bbf70a540471eae9e4073
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/dtensor_weight_loaders.py
@@ -0,0 +1,380 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/model_loader
+
+from typing import Dict
+
+import torch.nn as nn
+from torch.distributed._tensor import DTensor
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.utils import is_pp_missing_parameter
+
+
+def gemma_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("qkv_proj", "q_proj", "q"),
+ ("qkv_proj", "k_proj", "k"),
+ ("qkv_proj", "v_proj", "v"),
+ ("gate_up_proj", "gate_proj", 0),
+ ("gate_up_proj", "up_proj", 1),
+ ]
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+ for param_name, shard_name, shard_id in stacked_params_mapping:
+ if shard_name not in name:
+ continue
+ stacked_name = name.replace(shard_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if stacked_name.endswith(".bias") and stacked_name not in params_dict:
+ continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[stacked_name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
+ break
+ else:
+ # lm_head is not used in vllm as it is tied with embed_token.
+ # To prevent errors, skip loading lm_head.weight.
+ if "lm_head.weight" in name:
+ continue
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
+
+
+def gptbigcode_dtensor_load_weights(actor_weights: Dict, vllm_model: nn.Module):
+ params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
+ for name, loaded_weight in actor_weights.items():
+ if "lm_head.weight" in name:
+ continue
+ if ".attn.bias" in name:
+ # Skip attention mask.
+ # NOTE: "c_attn.bias" should not be skipped.
+ continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
+
+
+def starcoder2_dtensor_load_weights(actor_weights: Dict, vllm_model: nn.Module):
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("qkv_proj", "q_proj", "q"),
+ ("qkv_proj", "k_proj", "k"),
+ ("qkv_proj", "v_proj", "v"),
+ ]
+
+ params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
+ for name, loaded_weight in actor_weights.items():
+ if "rotary_emb.inv_freq" in name:
+ continue
+
+ for param_name, weight_name, shard_id in stacked_params_mapping:
+ if weight_name not in name:
+ continue
+ name = name.replace(weight_name, param_name)
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
+ break
+ else:
+ if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
+ continue
+ param = params_dict[name]
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
+
+
+def llama_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ (".qkv_proj", ".q_proj", "q"),
+ (".qkv_proj", ".k_proj", "k"),
+ (".qkv_proj", ".v_proj", "v"),
+ (".gate_up_proj", ".gate_proj", 0),
+ (".gate_up_proj", ".up_proj", 1),
+ ]
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+ if "rotary_emb.inv_freq" in name:
+ continue
+ if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+ # Models trained using ColossalAI may include these tensors in
+ # the checkpoint. Skip them.
+ continue
+ # With tie_word_embeddings, we can skip lm_head.weight
+ # The weight might appear unnecessarily in the files if the model is
+ # processed with quantization, LoRA, fine-tuning, etc.
+ if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
+ continue
+ for param_name, weight_name, shard_id in stacked_params_mapping:
+ if weight_name not in name:
+ continue
+ name = name.replace(weight_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
+ break
+ else:
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight)
+
+
+def qwen2_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("qkv_proj", "q_proj", "q"),
+ ("qkv_proj", "k_proj", "k"),
+ ("qkv_proj", "v_proj", "v"),
+ ("gate_up_proj", "gate_proj", 0),
+ ("gate_up_proj", "up_proj", 1),
+ ]
+ params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
+ for name, loaded_weight in actor_weights.items():
+ if "rotary_emb.inv_freq" in name:
+ continue
+ if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
+ continue
+ for param_name, weight_name, shard_id in stacked_params_mapping:
+ if weight_name not in name:
+ continue
+ name = name.replace(weight_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
+ break
+ else:
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ param = params_dict[name]
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
+
+
+def qwen2vl_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("qkv_proj", "q_proj", "q"),
+ ("qkv_proj", "k_proj", "k"),
+ ("qkv_proj", "v_proj", "v"),
+ ("gate_up_proj", "gate_proj", 0),
+ ("gate_up_proj", "up_proj", 1),
+ ]
+ params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
+ for name, loaded_weight in actor_weights.items():
+ if "rotary_emb.inv_freq" in name:
+ continue
+ if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
+ continue
+ for param_name, weight_name, shard_id in stacked_params_mapping:
+ if weight_name not in name:
+ continue
+ name = name.replace(weight_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
+ break
+ else:
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ param = params_dict[name]
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
+
+
+from vllm.model_executor.layers.fused_moe import FusedMoE
+
+
+def deepseekv2_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("gate_up_proj", "gate_proj", 0),
+ ("gate_up_proj", "up_proj", 1),
+ ]
+
+ # Params for weights, fp8 weight scales, fp8 activation scales
+ # (param_name, weight_name, expert_id, shard_id)
+ expert_params_mapping = FusedMoE.make_expert_params_mapping(
+ ckpt_gate_proj_name="gate_proj",
+ ckpt_down_proj_name="down_proj",
+ ckpt_up_proj_name="up_proj",
+ num_experts=vllm_model.config.n_routed_experts,
+ )
+
+ params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
+ for name, loaded_weight in actor_weights.items():
+ if "rotary_emb.inv_freq" in name:
+ continue
+ for param_name, weight_name, shard_id in stacked_params_mapping:
+ # Skip non-stacked layers and experts (experts handled below).
+ if weight_name not in name:
+ continue
+ # We have mlp.experts[0].gate_proj in the checkpoint.
+ # Since we handle the experts below in expert_params_mapping,
+ # we need to skip here BEFORE we update the name, otherwise
+ # name will be updated to mlp.experts[0].gate_up_proj, which
+ # will then be updated below in expert_params_mapping
+ # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+ if ("mlp.experts." in name) and name not in params_dict:
+ continue
+ name = name.replace(weight_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+
+ if is_pp_missing_parameter(name, vllm_model):
+ continue
+
+ param = params_dict[name]
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
+ break
+ else:
+ for mapping in expert_params_mapping:
+ param_name, weight_name, expert_id, shard_id = mapping
+ if weight_name not in name:
+ continue
+ name = name.replace(weight_name, param_name)
+
+ if is_pp_missing_parameter(name, vllm_model):
+ continue
+
+ param = params_dict[name]
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(
+ param,
+ local_loaded_weight.to(dtype=param.dtype),
+ weight_name,
+ shard_id=shard_id,
+ expert_id=expert_id,
+ )
+ break
+ else:
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+
+ if is_pp_missing_parameter(name, vllm_model):
+ continue
+
+ param = params_dict[name]
+ local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
+
+
+def gpt2_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ # TODO: GPT-2 dtensor loading is not implemented yet; this stub is a no-op.
+ pass
+
+
+def redistribute_dtensor(param_name: str, loaded_weights: DTensor, parallelize_plan: Dict = None):
+ param_name = _process_parameter_names(name=param_name)
+ if parallelize_plan is not None:
+ assert (
+ param_name
+ in parallelize_plan.keys()), f"param name: {param_name} not in parallelize_plan :{parallelize_plan.keys()}"
+ placement = parallelize_plan[param_name]
+ local_loaded_weights = loaded_weights.redistribute(device_mesh=loaded_weights.device_mesh,
+ placements=placement).to_local()
+ else:
+ local_loaded_weights = loaded_weights.full_tensor()
+ return local_loaded_weights
+
+
+def _process_parameter_names(name):
+ # Remove '.weight' if it exists at the end of the string
+ if name.endswith(".weight"):
+ name = name[:-7]
+
+ # Remove 'model.layers.x.' or 'model.' prefix
+ if "model.layers" in name:
+ parts = name.split(".")
+ # Reconstruct the string without 'model.layers.x.'
+ name = ".".join(parts[3:]) # parts[0] is 'model', parts[1] is 'layers', parts[2] is 'x'
+ elif name.startswith("model."):
+ name = name[6:] # Remove 'model.'
+
+ return name
+
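+# Worked examples of the name processing above (values follow directly from
+# the code):
+#
+#     _process_parameter_names("model.layers.0.self_attn.q_proj.weight")
+#     # -> "self_attn.q_proj"
+#     _process_parameter_names("model.embed_tokens.weight")
+#     # -> "embed_tokens"
+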
+
+__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__ = {
+ "GPT2LMHeadModel": gpt2_dtensor_weight_loader,
+ "LlamaForCausalLM": llama_dtensor_weight_loader,
+ "LLaMAForCausalLM": llama_dtensor_weight_loader,
+ "MistralForCausalLM": llama_dtensor_weight_loader, # mistral is the same as llama in vLLM
+ "InternLMForCausalLM": llama_dtensor_weight_loader,
+ "AquilaModel": llama_dtensor_weight_loader,
+ "AquilaForCausalLM": llama_dtensor_weight_loader,
+ "Phi3ForCausalLM": llama_dtensor_weight_loader,
+ "GemmaForCausalLM": gemma_dtensor_weight_loader,
+ "Gemma2ForCausalLM": gemma_dtensor_weight_loader,
+ "GPTBigCodeForCausalLM": gptbigcode_dtensor_load_weights,
+ "Starcoder2ForCausalLM": starcoder2_dtensor_load_weights,
+ "Qwen2ForCausalLM": qwen2_dtensor_weight_loader,
+ "DeepseekV2ForCausalLM": deepseekv2_dtensor_weight_loader,
+ "Qwen2VLForConditionalGeneration": qwen2vl_dtensor_weight_loader,
+}
+
+
+# the actor model is .state_dict()
+# Load dtensor weights
+def load_dtensor_weights(actor_weights: Dict, vllm_model: nn.Module):
+ weight_loader = _get_model_weight_loader(vllm_model.__class__.__name__)
+ weight_loader(actor_weights, vllm_model)
+ # NOTE(sgm): to reduce peak memory usage, we offload the vllm model to CPU
+ # after init, so we need to move it back to GPU after syncing model weights
+ # in the first iteration.
+ vllm_model = vllm_model.cuda()
+
+
+def _get_model_weight_loader(arch: str):
+ if arch in __MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__:
+ return __MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__[arch]
+ raise ValueError(f"Model architecture {arch} is not supported for now. "
+ f"Supported architectures: {__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__.keys()}")
+
+
+# NOTE(sgm): we use per-parameter weight loader in each vllm sub
+def update_dtensor_weight_loader():
+ pass
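+
+
+# Dispatch sketch: the loader is chosen by the vLLM model's class name, so a
+# Qwen2ForCausalLM instance routes to qwen2_dtensor_weight_loader:
+#
+#     load_dtensor_weights(actor_state_dict, vllm_model)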
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/hf_weight_loader.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/hf_weight_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3e5b22b2fed3b17f22f66c7acef8094c1c7871a
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/hf_weight_loader.py
@@ -0,0 +1,41 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/model_loader
+
+from typing import Dict
+
+import torch.nn as nn
+from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+
+
+def update_hf_weight_loader():
+ print("no hf weight loader needs to be updated")
+ return
+
+
+def load_hf_weights(actor_weights: Dict, vllm_model: nn.Module):
+ assert isinstance(actor_weights, Dict)
+ with set_default_torch_dtype(next(vllm_model.parameters()).dtype): # TODO
+ if vllm_model.config.tie_word_embeddings and "lm_head.weight" in actor_weights.keys():
+ del actor_weights["lm_head.weight"]
+ vllm_model.load_weights(actor_weights.items())
+ for _, module in vllm_model.named_modules():
+ quant_method = getattr(module, "quant_method", None)
+ if quant_method is not None:
+ quant_method.process_weights_after_loading(module)
+ # FIXME: Remove this after Mixtral is updated
+ # to use quant_method.
+ if hasattr(module, "process_weights_after_loading"):
+ module.process_weights_after_loading()
+ vllm_model = vllm_model.cuda()
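+
+
+# Hedged usage sketch: `actor_weights` is a full (unsharded) HF state dict,
+# e.g.
+#
+#     load_hf_weights(actor_model.state_dict(), vllm_model)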
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/llm.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd3d646db46e0b6085a94a49da695d5a6feb1403
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/llm.py
@@ -0,0 +1,200 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py
+
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from torch.nn.utils.rnn import pad_sequence
+from transformers import PretrainedConfig, PreTrainedTokenizer, PreTrainedTokenizerFast
+from verl.workers.rollout.tokenizer import HybridEngineBaseTokenizer
+from vllm import LLM
+from vllm.outputs import EmbeddingRequestOutput, RequestOutput
+from vllm.utils import Counter
+
+from .arg_utils import EngineArgs
+from .llm_engine_sp import LLMEngine
+
+
+class LLM(LLM):
+ """An LLM for generating texts from given prompts and sampling parameters.
+
+ This class includes a tokenizer, a language model (possibly distributed
+ across multiple GPUs), and GPU memory space allocated for intermediate
+ states (aka KV cache). Given a batch of prompts and sampling parameters,
+ this class generates texts from the model, using an intelligent batching
+ mechanism and efficient memory management.
+
+ NOTE: This class is intended to be used for offline inference. For online
+ serving, use the `AsyncLLMEngine` class instead.
+ NOTE: For the comprehensive list of arguments, see `EngineArgs`.
+
+ Args:
+ model: A HuggingFace Transformers model instance.
+ tokenizer: A HuggingFace Transformers tokenizer instance.
+ tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
+ if available, and "slow" will always use the slow tokenizer.
+ trust_remote_code: Trust remote code (e.g., from HuggingFace) when
+ downloading the model and tokenizer.
+ tensor_parallel_size: The number of GPUs to use for distributed
+ execution with tensor parallelism.
+ dtype: The data type for the model weights and activations. Currently,
+ we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
+ the `torch_dtype` attribute specified in the model config file.
+ However, if the `torch_dtype` in the config is `float32`, we will
+ use `float16` instead.
+ quantization: The method used to quantize the model weights. Currently,
+ we support "awq". If None, we assume the model weights are not
+ quantized and use `dtype` to determine the data type of the weights.
+ revision: The specific model version to use. It can be a branch name,
+ a tag name, or a commit id.
+ tokenizer_revision: The specific tokenizer version to use. It can be a
+ branch name, a tag name, or a commit id.
+ seed: The seed to initialize the random number generator for sampling.
+ gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
+ reserve for the model weights, activations, and KV cache. Higher
+ values will increase the KV cache size and thus improve the model's
+ throughput. However, if the value is too high, it may cause out-of-
+ memory (OOM) errors.
+ swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
+ This can be used for temporarily storing the states of the requests
+ when their `best_of` sampling parameters are larger than 1. If all
+ requests will have `best_of=1`, you can safely set this to 0.
+ Otherwise, too small values may cause out-of-memory (OOM) errors.
+ enforce_eager: Whether to enforce eager execution. If True, we will
+ disable CUDA graph and always execute the model in eager mode.
+ If False, we will use CUDA graph and eager execution in hybrid.
+ max_context_len_to_capture: Maximum context len covered by CUDA graphs.
+ When a sequence has context length larger than this, we fall back
+ to eager mode.
+ disable_custom_all_reduce: See ParallelConfig
+ """
+
+ def __init__(
+ self,
+ model: Union[nn.Module, Dict], # model itself or its parameter dict
+ tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer],
+ model_hf_config: PretrainedConfig,
+ tokenizer_mode: str = "auto",
+ trust_remote_code: bool = False,
+ skip_tokenizer_init: bool = False,
+ tensor_parallel_size: int = 1,
+ dtype: str = "auto",
+ quantization: Optional[str] = None,
+ revision: Optional[str] = None,
+ tokenizer_revision: Optional[str] = None,
+ seed: int = 0,
+ gpu_memory_utilization: float = 0.9,
+ swap_space: int = 4,
+ cpu_offload_gb: float = 0,
+ enforce_eager: bool = False,
+ max_context_len_to_capture: Optional[int] = None,
+ max_seq_len_to_capture: int = 8192,
+ disable_custom_all_reduce: bool = False,
+ load_format="auto",
+ **kwargs,
+ ) -> None:
+ if "disable_log_stats" not in kwargs:
+ kwargs["disable_log_stats"] = True
+ removed_vision_keys = ("image_token_id", "image_feature_size", "image_input_shape", "image_input_type")
+ if any(k in kwargs for k in removed_vision_keys):
+ raise TypeError("There is no need to pass vision-related arguments anymore.")
+ engine_args = EngineArgs(
+ model_hf_config=model_hf_config,
+ # tokenizer=tokenizer,
+ tokenizer_mode=tokenizer_mode,
+ skip_tokenizer_init=skip_tokenizer_init,
+ trust_remote_code=trust_remote_code,
+ tensor_parallel_size=tensor_parallel_size,
+ dtype=dtype,
+ quantization=quantization,
+ revision=revision,
+ tokenizer_revision=tokenizer_revision,
+ seed=seed,
+ gpu_memory_utilization=gpu_memory_utilization,
+ swap_space=swap_space,
+ cpu_offload_gb=cpu_offload_gb,
+ enforce_eager=enforce_eager,
+ max_context_len_to_capture=max_context_len_to_capture,
+ max_seq_len_to_capture=max_seq_len_to_capture,
+ disable_custom_all_reduce=disable_custom_all_reduce,
+ load_format=load_format,
+ **kwargs,
+ )
+ tokenizer_cls = (PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer)
+ if not isinstance(tokenizer, tokenizer_cls):
+ raise ValueError(
+ f"Unexpected tokenizer type: {type(tokenizer)}. Must be"
+ "one of the following: PreTrainedTokenizer, PreTrainedTokenizerFast, verl.workers.rollout.HybridEngineBaseTokenizer"
+ )
+ self.llm_engine = LLMEngine.from_engine_args(model, tokenizer, engine_args) # TODO: check usage context
+ self.request_counter = Counter()
+
+ def init_cache_engine(self):
+ self.llm_engine.init_cache_engine()
+
+ def free_cache_engine(self):
+ self.llm_engine.free_cache_engine()
+
+ def get_tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+ return self.llm_engine.tokenizer
+
+ def set_tokenizer(
+ self,
+ tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+ ) -> None:
+ self.llm_engine.tokenizer = tokenizer
+
+ def _run_engine(self, *, use_tqdm: bool) -> Tuple[torch.Tensor, torch.Tensor]:
+ outputs = super()._run_engine(use_tqdm=use_tqdm)
+ return self._post_process_outputs(outputs)
+
+ # # NOTE(shengguangming): add for verl
+ # # TODO(sgm): we can optimize it by making the dataloader yield List[int] without padding.
+ # def _pre_process_inputs(self, prompt_token_ids: torch.Tensor) -> List[int]:
+ # # remove the left padding in the prompt token_id
+ # pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id is not None else self.llm_engine.tokenizer.eos_token_id
+ # non_pad_index = torch.nonzero(prompt_token_ids != pad_token_id, as_tuple=False)[0][0]
+ # token_ids = prompt_token_ids[non_pad_index:].tolist()
+ # return token_ids
+
+ # NOTE(shengguangming): add for verl
+ def _post_process_outputs(self, request_outputs: List[RequestOutput]) -> Tuple[torch.Tensor, torch.Tensor]:
+ output_token_ids = []
+ logprobs = []
+ for request_output in request_outputs: # List[RequestOutput]
+ outputs = request_output.outputs
+ for output in outputs: # List[CompletionOutput], usually len == 1
+ output_token_ids.append(torch.tensor(output.token_ids))
+ # TODO(shengguangming): can be optimized by rewriting Sampler._get_logprobs()
+ logprobs_dicts = output.logprobs
+ if logprobs_dicts is not None:
+ logprob = []
+ for logprobs_dict, token_id in zip(logprobs_dicts, output.token_ids):
+ logprob.append(logprobs_dict[token_id].logprob)
+ logprobs.append(torch.tensor(logprob))
+
+ pad_token_id = (self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id is not None
+ else self.llm_engine.tokenizer.eos_token_id)
+ output_token_ids = pad_sequence(output_token_ids, batch_first=True, padding_value=pad_token_id)
+ if len(logprobs) > 0:
+ logprobs = pad_sequence(logprobs, batch_first=True, padding_value=pad_token_id)
+ return output_token_ids, logprobs
+
+ def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
+ self.llm_engine.sync_model_weights(actor_weights=actor_weights, load_format=load_format)
+
+ def offload_model_weights(self) -> None:
+ self.llm_engine.offload_model_weights()
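+
+
+# A minimal usage sketch (illustrative; names such as `actor_module`, `hf_tokenizer`
+# and `hf_config` are assumptions, not part of this module):
+#
+#   llm = LLM(model=actor_module, tokenizer=hf_tokenizer, model_hf_config=hf_config,
+#             tensor_parallel_size=2, dtype="bfloat16")
+#   llm.sync_model_weights(actor_weights=dict(actor_module.named_parameters()),
+#                          load_format="hf")
+#   llm.init_cache_engine()
+#   ...  # generate, then release KV-cache memory back to the trainer
+#   llm.free_cache_engine()
+#   llm.offload_model_weights()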
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/llm_engine_sp.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/llm_engine_sp.py
new file mode 100644
index 0000000000000000000000000000000000000000..10b112b2595d83514698589bba472efb07dea562
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/llm_engine_sp.py
@@ -0,0 +1,408 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/llm_engine.py
+
+from functools import partial
+from typing import Callable, Dict, Optional, Type, Union
+
+import torch
+import torch.nn as nn
+from vllm.config import (
+ CacheConfig,
+ DecodingConfig,
+ DeviceConfig,
+ EngineConfig,
+ LoadConfig,
+ LoRAConfig,
+ ModelConfig,
+ ObservabilityConfig,
+ ParallelConfig,
+ PromptAdapterConfig,
+ SchedulerConfig,
+ SpeculativeConfig,
+)
+from vllm.core.scheduler import Scheduler
+from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.llm_engine import LLMEngine, SchedulerContext, SchedulerOutputState, _load_generation_config_dict
+from vllm.engine.metrics_types import StatLoggerBase
+from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.executor.executor_base import ExecutorBase
+from vllm.inputs import INPUT_REGISTRY, InputRegistry
+from vllm.inputs.preprocess import InputPreprocessor
+from vllm.logger import init_logger
+from vllm.sequence import Sequence
+from vllm.tracing import init_tracer
+from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.usage.usage_lib import UsageContext, is_usage_stats_enabled, usage_message
+from vllm.utils import Counter, weak_bind
+from vllm.version import __version__ as VLLM_VERSION
+
+from .arg_utils import EngineArgs
+from .config import LoadConfig, ModelConfig
+from .tokenizer import TokenizerGroup
+
+logger = init_logger(__name__)
+_LOCAL_LOGGING_INTERVAL_SEC = 5
+
+
+class LLMEngine(LLMEngine):
+ """An LLM engine that receives requests and generates texts.
+
+ This is the main class for the vLLM engine. It receives requests
+ from clients and generates texts from the LLM. It includes a tokenizer, a
+ language model (possibly distributed across multiple GPUs), and GPU memory
+ space allocated for intermediate states (aka KV cache). This class utilizes
+ iteration-level scheduling and efficient memory management to maximize the
+ serving throughput.
+
+ The :class:`~vllm.LLM` class wraps this class for offline batched inference
+ and the :class:`AsyncLLMEngine` class wraps this class for online serving.
+
+ The config arguments are derived from :class:`~vllm.EngineArgs`. (See
+ :ref:`engine_args`)
+
+ Args:
+ model_config: The configuration related to the LLM model.
+ cache_config: The configuration related to the KV cache memory
+ management.
+ parallel_config: The configuration related to distributed execution.
+ scheduler_config: The configuration related to the request scheduler.
+ device_config: The configuration related to the device.
+ lora_config (Optional): The configuration related to serving multi-LoRA.
+ speculative_config (Optional): The configuration related to speculative
+ decoding.
+ executor_class: The model executor class for managing distributed
+ execution.
+ prompt_adapter_config (Optional): The configuration related to serving
+ prompt adapters.
+ log_stats: Whether to log statistics.
+ usage_context: Specified entry point, used for usage info collection.
+ """
+
+ def __init__(
+ self,
+ # NOTE(sgm): first two arguments are added for verl
+ model: Union[nn.Module, Dict], # model itself or its parameter dict
+ tokenizer: nn.Module,
+ # NOTE(sgm): vllm original arguments
+ model_config: ModelConfig,
+ cache_config: CacheConfig,
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ device_config: DeviceConfig,
+ load_config: LoadConfig,
+ lora_config: Optional[LoRAConfig],
+ speculative_config: Optional[SpeculativeConfig],
+ decoding_config: Optional[DecodingConfig],
+ observability_config: Optional[ObservabilityConfig],
+ prompt_adapter_config: Optional[PromptAdapterConfig],
+ executor_class: Type[ExecutorBase],
+ log_stats: bool,
+ usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+ stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
+ input_registry: InputRegistry = INPUT_REGISTRY,
+ use_cached_outputs: bool = False,
+ ) -> None:
+ logger.info(
+ "Initializing an LLM engine (v%s) with config: "
+ "model=%r, speculative_config=%r, tokenizer=%r, "
+ "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, "
+ "override_neuron_config=%s, "
+ "rope_scaling=%r, rope_theta=%r, tokenizer_revision=%s, "
+ "trust_remote_code=%s, dtype=%s, max_seq_len=%d, "
+ "download_dir=%r, load_format=%s, tensor_parallel_size=%d, "
+ "pipeline_parallel_size=%d, "
+ "disable_custom_all_reduce=%s, quantization=%s, "
+ "enforce_eager=%s, kv_cache_dtype=%s, "
+ "quantization_param_path=%s, device_config=%s, "
+ "decoding_config=%r, observability_config=%r, "
+ "seed=%d, served_model_name=%s, use_v2_block_manager=%s, "
+ "num_scheduler_steps=%d, chunked_prefill_enabled=%s "
+ "multi_step_stream_outputs=%s, enable_prefix_caching=%s, "
+ "use_async_output_proc=%s, use_cached_outputs=%s, "
+ "mm_processor_kwargs=%s)",
+ VLLM_VERSION,
+ model_config.model,
+ speculative_config,
+ model_config.tokenizer,
+ model_config.skip_tokenizer_init,
+ model_config.tokenizer_mode,
+ model_config.revision,
+ model_config.override_neuron_config,
+ model_config.rope_scaling,
+ model_config.rope_theta,
+ model_config.tokenizer_revision,
+ model_config.trust_remote_code,
+ model_config.dtype,
+ model_config.max_model_len,
+ load_config.download_dir,
+ load_config.load_format,
+ parallel_config.tensor_parallel_size,
+ parallel_config.pipeline_parallel_size,
+ parallel_config.disable_custom_all_reduce,
+ model_config.quantization,
+ model_config.enforce_eager,
+ cache_config.cache_dtype,
+ model_config.quantization_param_path,
+ device_config.device,
+ decoding_config,
+ observability_config,
+ model_config.seed,
+ model_config.served_model_name,
+ scheduler_config.use_v2_block_manager,
+ scheduler_config.num_scheduler_steps,
+ scheduler_config.chunked_prefill_enabled,
+ scheduler_config.multi_step_stream_outputs,
+ cache_config.enable_prefix_caching,
+ model_config.use_async_output_proc,
+ use_cached_outputs,
+ model_config.mm_processor_kwargs,
+ )
+ # TODO(woosuk): Print more configs in debug mode.
+ self.model_config = model_config
+ self.cache_config = cache_config
+ self.lora_config = lora_config
+ self.parallel_config = parallel_config
+ self.scheduler_config = scheduler_config
+ self.device_config = device_config
+ self.speculative_config = speculative_config
+ self.load_config = load_config
+ self.decoding_config = decoding_config or DecodingConfig()
+ self.prompt_adapter_config = prompt_adapter_config
+ self.observability_config = observability_config or ObservabilityConfig()
+ self.log_stats = log_stats
+ self.use_cached_outputs = use_cached_outputs
+
+ if not self.model_config.skip_tokenizer_init:
+ self.tokenizer = self._init_tokenizer(tokenizer)
+ self.detokenizer = Detokenizer(self.tokenizer)
+ tokenizer_group = self.get_tokenizer_group()
+ else:
+ self.tokenizer = None
+ self.detokenizer = None
+ tokenizer_group = None
+
+ # Ensure that the function doesn't contain a reference to self,
+ # to avoid engine GC issues
+ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer:
+ assert tokenizer_group, "tokenizer_group cannot be None, " "make sure skip_tokenizer_init is False"
+ return tokenizer_group.get_lora_tokenizer(sequence.lora_request)
+
+ self.seq_counter = Counter()
+ self.generation_config_fields = _load_generation_config_dict(model_config)
+
+ self.input_preprocessor = InputPreprocessor(model_config, self.tokenizer)
+
+ self.input_registry = input_registry
+ self.input_processor = input_registry.create_input_processor(model_config)
+
+ self.model_executor = executor_class(
+ model=model, # add for spmd_gpu_executor
+ model_config=model_config,
+ cache_config=cache_config,
+ parallel_config=parallel_config,
+ scheduler_config=scheduler_config,
+ device_config=device_config,
+ lora_config=lora_config,
+ speculative_config=speculative_config,
+ load_config=load_config,
+ prompt_adapter_config=prompt_adapter_config,
+ observability_config=self.observability_config,
+ )
+
+ if not self.model_config.embedding_mode:
+ self._initialize_kv_caches()
+
+ # If usage stat is enabled, collect relevant info.
+ if is_usage_stats_enabled():
+ from vllm.model_executor.model_loader import get_architecture_class_name
+
+ usage_message.report_usage(
+ get_architecture_class_name(model_config),
+ usage_context,
+ extra_kvs={
+ # Common configuration
+ "dtype": str(model_config.dtype),
+ "tensor_parallel_size": parallel_config.tensor_parallel_size,
+ "block_size": cache_config.block_size,
+ "gpu_memory_utilization": cache_config.gpu_memory_utilization,
+ # Quantization
+ "quantization": model_config.quantization,
+ "kv_cache_dtype": str(cache_config.cache_dtype),
+ # Feature flags
+ "enable_lora": bool(lora_config),
+ "enable_prompt_adapter": bool(prompt_adapter_config),
+ "enable_prefix_caching": cache_config.enable_prefix_caching,
+ "enforce_eager": model_config.enforce_eager,
+ "disable_custom_all_reduce": parallel_config.disable_custom_all_reduce,
+ },
+ )
+
+ if self.tokenizer:
+ # Ping the tokenizer to ensure liveness if it runs in a
+ # different process.
+ self.tokenizer.ping()
+
+ self.cached_scheduler_outputs = [
+ SchedulerOutputState() for _ in range(self.parallel_config.pipeline_parallel_size)
+ ]
+
+ self.scheduler_contexts = [
+ SchedulerContext(multi_step_stream_outputs=self.scheduler_config.multi_step_stream_outputs)
+ for _ in range(self.parallel_config.pipeline_parallel_size)
+ ]
+
+ if model_config.use_async_output_proc:
+ process_model_outputs = weak_bind(self._process_model_outputs)
+
+ self.async_callbacks = [
+ partial(process_model_outputs, ctx=self.scheduler_contexts[v_id])
+ for v_id in range(self.parallel_config.pipeline_parallel_size)
+ ]
+ else:
+ self.async_callbacks = []
+
+ # Currently used by AsyncLLMEngine to ensure quick append
+ # of request outputs to asyncio queues
+ self.process_request_outputs_callback: Optional[Callable] = None
+
+ # Create the scheduler.
+ # NOTE: the cache_config here have been updated with the numbers of
+ # GPU and CPU blocks, which are profiled in the distributed executor.
+ self.scheduler = [
+ Scheduler(
+ scheduler_config,
+ cache_config,
+ lora_config,
+ parallel_config.pipeline_parallel_size,
+ self.async_callbacks[v_id] if model_config.use_async_output_proc else None,
+ ) for v_id in range(parallel_config.pipeline_parallel_size)
+ ]
+
+ # Metric Logging.
+ if self.log_stats:
+ if stat_loggers is not None:
+ self.stat_loggers = stat_loggers
+ else:
+ # Lazy import for prometheus multiprocessing.
+ # We need to set PROMETHEUS_MULTIPROC_DIR environment variable
+ # before prometheus_client is imported.
+ # See https://prometheus.github.io/client_python/multiprocess/
+ from vllm.engine.metrics import LoggingStatLogger, PrometheusStatLogger
+
+ self.stat_loggers = {
+ "logging":
+ LoggingStatLogger(local_interval=_LOCAL_LOGGING_INTERVAL_SEC),
+ "prometheus":
+ PrometheusStatLogger(
+ local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
+ labels=dict(model_name=model_config.served_model_name),
+ max_model_len=self.model_config.max_model_len,
+ ),
+ }
+ self.stat_loggers["prometheus"].info("cache_config", self.cache_config)
+
+ self.tracer = None
+ if self.observability_config.otlp_traces_endpoint:
+ self.tracer = init_tracer("vllm.llm_engine", self.observability_config.otlp_traces_endpoint)
+
+ # Create sequence output processor, e.g. for beam search or
+ # speculative decoding.
+ self.output_processor = SequenceGroupOutputProcessor.create_output_processor(
+ self.scheduler_config,
+ self.detokenizer,
+ self.scheduler,
+ self.seq_counter,
+ get_tokenizer_for_seq,
+ stop_checker=StopChecker(
+ self.scheduler_config.max_model_len,
+ get_tokenizer_for_seq,
+ ),
+ )
+
+ # TODO(sgm): added for verl, but we may not need a tokenizer in Rollout
+ def _init_tokenizer(self, tokenizer, **tokenizer_init_kwargs):
+ init_kwargs = dict(enable_lora=bool(self.lora_config),
+ max_num_seqs=self.scheduler_config.max_num_seqs,
+ max_input_length=None)
+ init_kwargs.update(tokenizer_init_kwargs)
+ return TokenizerGroup(tokenizer, **init_kwargs)
+
+ def init_cache_engine(self):
+ # TODO: check whether we should rebuild the CUDA graph every iteration when offloading/loading the KV cache;
+ # re-capturing the CUDA graph would be time-consuming.
+ self.model_executor.init_cache_engine()
+
+ def free_cache_engine(self):
+ self.model_executor.free_cache_engine()
+
+ # NOTE(sgm): currently, we only support GPU executor
+ # The GPUExecutor remove the Ray dependency
+ @classmethod
+ def _get_executor_cls(cls, engine_config: EngineConfig) -> Type[ExecutorBase]:
+ distributed_executor_backend = engine_config.parallel_config.distributed_executor_backend
+ # Initialize the cluster and specify the executor class.
+ assert (engine_config.device_config.device_type == "cuda"
+ ), "Currently, the vllm in verl only supports running on GPU"
+
+ # print('Waiting for debugger'); import os,debugpy; debugpy.listen(('localhost', 5678 + int(os.getenv('RANK', '0')))); debugpy.wait_for_client()
+ if engine_config.parallel_config.world_size == 1:
+ engine_config.load_config.load_format = "dummy_hf"
+
+ from .spmd_gpu_executor import SPMDGPUExecutor
+
+ executor_class = SPMDGPUExecutor
+
+ return executor_class
+
+ @classmethod
+ def from_engine_args(
+ cls,
+ model,
+ tokenizer,
+ engine_args: EngineArgs,
+ usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+ stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
+ ) -> "LLMEngine":
+ """Creates an LLM engine from the engine arguments."""
+ # Create the engine configs.
+ engine_config = engine_args.create_engine_config()
+ executor_class = cls._get_executor_cls(engine_config)
+ # Initialize the cluster and specify the executor class.
+ assert (engine_config.device_config.device_type == "cuda"
+ ), "Currently, the vllm in verl only supports running on GPU"
+
+ from .spmd_gpu_executor import SPMDGPUExecutor
+
+ executor_class = SPMDGPUExecutor
+
+ # Create the LLM engine.
+ engine = cls(
+ model,
+ tokenizer,
+ **engine_config.to_dict(),
+ executor_class=executor_class,
+ log_stats=not engine_args.disable_log_stats,
+ usage_context=usage_context,
+ stat_loggers=stat_loggers,
+ )
+ return engine
+
+ def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
+ self.model_executor.sync_model_weights(actor_weights=actor_weights, load_format=load_format)
+
+ def offload_model_weights(self) -> None:
+ self.model_executor.offload_model_weights()
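+
+
+# A minimal construction sketch (illustrative; `actor_module`, `hf_tokenizer` and
+# `engine_args` are assumed to come from the caller, as in llm.py):
+#
+#   engine = LLMEngine.from_engine_args(actor_module, hf_tokenizer, engine_args)
+#   engine.sync_model_weights(dict(actor_module.named_parameters()), load_format="hf")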
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/megatron_weight_loaders.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/megatron_weight_loaders.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fd6c0e624f7f51f9a16b7d5e8059aa1dbef905b
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/megatron_weight_loaders.py
@@ -0,0 +1,308 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/model_loader
+
+from typing import Dict
+
+import torch
+import torch.nn as nn
+from vllm.model_executor.layers.linear import *
+from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead, VocabParallelEmbedding
+from vllm.model_executor.models import ModelRegistry
+
+
+# NOTE(shengguangming): replace the origin weight loader function in the class
+def parallel_weight_loader(self, param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+ """Parallel Linear weight loader."""
+ assert (param.size() == loaded_weight.size(
+ )), "the parameter size does not match the loaded weight size, param size: {}, loaded_weight size: {}".format(
+ param.size(), loaded_weight.size())
+ assert (param.data.dtype == loaded_weight.data.dtype
+ ), "to share weights, the data types must also match"
+
+ param.data = loaded_weight.data
+
+
+def default_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+ """Default weight loader."""
+ assert param.size() == loaded_weight.size()
+ assert (param.data.dtype == loaded_weight.data.dtype
+ ), "to share weights, the data types must also match"
+
+ param.data = loaded_weight.data
+
+
+def gpt2_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
+ for name, loaded_weight in actor_weights.items():
+ if "lm_head.weight" in name:
+ # GPT-2 ties the weights of the embedding layer and the final
+ # linear layer.
+ continue
+ if ".attn.bias" in name or ".attn.masked_bias" in name:
+ # Skip attention mask.
+ # NOTE: "c_attn.bias" should not be skipped.
+ continue
+ if not name.startswith("transformer."):
+ name = "transformer." + name
+ param = params_dict[name]
+ # The HF's GPT-2 implementation uses Conv1D instead of Linear.
+ # Because of this, we need to transpose the weights.
+ # Note(zhuohan): the logic below might break quantized models.
+ for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
+ if conv1d_weight_name not in name:
+ continue
+ if not name.endswith(".weight"):
+ continue
+ # TODO: check megatron
+ loaded_weight = loaded_weight.t()
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, loaded_weight)
+
+
+def llama_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ # NOTE(shengguangming): the megatron llama may have this prefix
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+ if "rotary_emb.inv_freq" in name:
+ continue
+ else:
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, loaded_weight)
+
+
+def llama_megatron_core_te_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ params_mapping = [
+ # (megatron core gpt model name, vllm model name)
+ ("embedding.word_embeddings", "model.embed_tokens"),
+ ("self_attention.linear_qkv.layer_norm_weight", "input_layernorm.weight"),
+ ("self_attention.linear_qkv.layer_norm_bias", "input_layernorm.bias"),
+ ("self_attention.linear_qkv", "self_attn.qkv_proj"),
+ ("self_attention.linear_qkv", "self_attn.qkv_proj"),
+ ("self_attention.linear_proj", "self_attn.o_proj"),
+ ("pre_mlp_layernorm", "post_attention_layernorm"),
+ ("mlp.linear_fc1.layer_norm_weight", "post_attention_layernorm.weight"),
+ ("mlp.linear_fc1.layer_norm_bias", "post_attention_layernorm.bias"),
+ ("mlp.linear_fc1", "mlp.gate_up_proj"),
+ ("mlp.linear_fc2", "mlp.down_proj"),
+ ("decoder.final_layernorm", "model.norm"),
+ ("output_layer", "lm_head"),
+ ]
+ # NOTE(shengguangming): the megatron llama may have this prefix
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+ name = _replace_name(name, params_mapping)
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ if "rotary_emb.inv_freq" in name:
+ continue
+ else:
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, loaded_weight)
+
+
+def llama_megatron_core_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ params_mapping = [
+ # (megatron core gpt model name, vllm model name)
+ ("embedding.word_embeddings", "model.embed_tokens"),
+ ("self_attention.linear_qkv", "self_attn.qkv_proj"),
+ ("self_attention.linear_proj", "self_attn.o_proj"),
+ (
+ "input_layernorm",
+ "input_layernorm",
+ ),
+ ("pre_mlp_layernorm", "post_attention_layernorm"),
+ ("mlp.linear_fc1", "mlp.gate_up_proj"),
+ ("mlp.linear_fc2", "mlp.down_proj"),
+ ("decoder.final_layernorm", "model.norm"),
+ ("output_layer", "lm_head"),
+ ]
+ # NOTE(shengguangming): the megatron llama may have this prefix
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+ name = _replace_name(name, params_mapping)
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ if "rotary_emb.inv_freq" in name:
+ continue
+ else:
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, loaded_weight)
+
+
+def _replace_name(megatron_name, name_mapping):
+ for m_name, v_name in name_mapping:
+ if m_name not in megatron_name:
+ continue
+ if "layers" in megatron_name: # deal with decoder layers
+ megatron_name = megatron_name.replace("decoder", "model")
+ megatron_name_list = megatron_name.split(".")
+ if "layer_norm_weight" in megatron_name_list or "layer_norm_bias" in megatron_name_list:
+ param_name_list = megatron_name_list[:3]
+ param_name_list.append(v_name)
+ param_name = ".".join(param_name_list)
+ else:
+ param_name_list = megatron_name_list[:3]
+ weight_or_bias = megatron_name_list[-1]
+ param_name_list.append(v_name)
+ param_name_list.append(weight_or_bias)
+ param_name = ".".join(param_name_list)
+ return param_name
+ else:
+ param_name = megatron_name.replace(m_name, v_name)
+ return param_name
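+
+
+# A worked example of the name translation above (assuming standard Megatron-Core
+# GPT parameter names; shown with the TE mapping used by
+# llama_megatron_core_te_weight_loader):
+#
+#   "decoder.layers.0.self_attention.linear_proj.weight"
+#       -> "model.layers.0.self_attn.o_proj.weight"
+#   "decoder.layers.0.self_attention.linear_qkv.layer_norm_weight"
+#       -> "model.layers.0.input_layernorm.weight"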
+
+
+def mistral_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ # TODO: need to implement a general way to deal with prefix
+ params_dict = dict(vllm_model.named_parameters())
+ for name, loaded_weight in actor_weights.items():
+ if "rotary_emb.inv_freq" in name:
+ continue
+ else:
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, loaded_weight)
+
+
+__LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__ = {
+ ColumnParallelLinear: parallel_weight_loader,
+ MergedColumnParallelLinear: parallel_weight_loader,
+ QKVParallelLinear: parallel_weight_loader,
+ RowParallelLinear: parallel_weight_loader,
+ VocabParallelEmbedding: parallel_weight_loader,
+ ParallelLMHead: parallel_weight_loader,
+ # "ScaledActivation.weight_loader": ScaledActivation, # TODO(shengguangming): latest commit in vllm fix awq for this function and add load_weights
+ # "default_weight_loader": default_weight_loader
+}
+
+# for layer_class, weight_loader in __LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__.items():
+# # setattr(layer_class, 'megatron_weight_loader', weight_loader)
+# layer_class.weight_loader = weight_loader
+
+__MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__ = {
+ "GPT2LMHeadModel": gpt2_weight_loader,
+ "LlamaForCausalLM": llama_megatron_weight_loader, # use te backend for open-source megatron
+ "LLaMAForCausalLM": llama_megatron_weight_loader,
+ "MistralForCausalLM": mistral_megatron_weight_loader,
+}
+
+
+# The actor model is passed as a .state_dict().
+# Load Megatron weights into the vllm model.
+def load_megatron_weights(actor_weights: Dict, vllm_model: nn.Module):
+ weight_loader = _get_model_weight_loader(vllm_model.__class__.__name__)
+ weight_loader(actor_weights, vllm_model)
+ # NOTE(sgm): to reduce peak memory usage, we offload the vllm model to CPU after init,
+ # so we move it back to CUDA here after syncing the model weights in the first iteration.
+ vllm_model = vllm_model.cuda()
+
+
+def _get_model_weight_loader(arch: str):
+ if arch in __MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__:
+ return __MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__[arch]
+ raise ValueError(f"Model architectures {arch} are not supported for now. "
+ f"Supported architectures: {ModelRegistry.get_supported_archs()}")
+
+
+def update_megatron_weight_loader():
+ for layer_class, weight_loader in __LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__.items():
+ layer_class.weight_loader = weight_loader
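+
+
+# A minimal usage sketch (illustrative): patch the vllm layer classes once at
+# startup, then push Megatron-partitioned weights into a built vllm model.
+# `megatron_state_dict` and `engine_model` are assumed names, not module symbols.
+#
+#   update_megatron_weight_loader()
+#   load_megatron_weights(megatron_state_dict, vllm_model=engine_model)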
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/model_loader.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/model_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..f146a0eae22563650ec87bc7e5ad3ce2c19e9398
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/model_loader.py
@@ -0,0 +1,338 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
+"""Utilities for selecting and loading models."""
+from typing import Dict, Optional, Union
+
+import torch
+import torch.nn as nn
+from transformers import PreTrainedModel
+from vllm.config import CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig
+from vllm.distributed.communication_op import tensor_model_parallel_all_gather
+from vllm.model_executor.model_loader import BaseModelLoader
+from vllm.model_executor.model_loader.loader import _initialize_model
+from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+
+from .config import LoadConfig, LoadFormat, ModelConfig
+from .dtensor_weight_loaders import load_dtensor_weights, update_dtensor_weight_loader
+from .hf_weight_loader import update_hf_weight_loader
+from .megatron_weight_loaders import load_megatron_weights, update_megatron_weight_loader
+
+
+def get_model(
+ actor_model: Union[PreTrainedModel, Dict],
+ model_config: ModelConfig,
+ load_config: LoadConfig,
+ device_config: DeviceConfig,
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ lora_config: Optional[LoRAConfig],
+ cache_config: CacheConfig = None,
+) -> nn.Module:
+ loader = get_model_loader(load_config)
+ if load_config.load_format.startswith("dummy"):
+ return loader.load_model(
+ model_config=model_config,
+ device_config=device_config,
+ lora_config=lora_config,
+ parallel_config=parallel_config,
+ scheduler_config=scheduler_config,
+ cache_config=cache_config,
+ )
+ else:
+ return loader.load_model(
+ actor_model=actor_model,
+ model_config=model_config,
+ device_config=device_config,
+ lora_config=lora_config,
+ parallel_config=parallel_config,
+ scheduler_config=scheduler_config,
+ cache_config=cache_config,
+ )
+
+
+def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
+ """Get a model loader based on the load format."""
+
+ if isinstance(load_config.load_format, type):
+ return load_config.load_format(load_config)
+
+ if load_config.load_format == LoadFormat.AUTO:
+ update_megatron_weight_loader()
+ return MegatronLoader(load_config)
+
+ # NOTE(sgm): change the weight_loader function in runtime
+ if load_config.load_format == LoadFormat.MEGATRON:
+ update_megatron_weight_loader()
+ return MegatronLoader(load_config)
+
+ if load_config.load_format == LoadFormat.HF:
+ update_hf_weight_loader()
+ return HFLoader(load_config)
+
+ if load_config.load_format == LoadFormat.DTENSOR:
+ update_dtensor_weight_loader()
+ return DTensorLoader(load_config)
+
+ if load_config.load_format == LoadFormat.DUMMY_HF:
+ update_hf_weight_loader()
+ return DummyModelLoader(load_config)
+
+ if load_config.load_format == LoadFormat.DUMMY_MEGATRON:
+ update_megatron_weight_loader()
+ return DummyModelLoader(load_config)
+
+ if load_config.load_format == LoadFormat.DUMMY_DTENSOR:
+ update_dtensor_weight_loader()
+ return DummyModelLoader(load_config)
+
+ raise ValueError("load format not supported in verl: {}, only support {} and {}".format(
+ load_config.load_format, LoadFormat.MEGATRON, LoadFormat.HF))
+
+
+class DummyModelLoader(BaseModelLoader):
+ """Model loader that will set model weights to random values."""
+
+ def __init__(self, load_config: LoadConfig):
+ super().__init__(load_config)
+ if load_config.model_loader_extra_config:
+ raise ValueError(f"Model loader extra config is not supported for "
+ f"load format {load_config.load_format}")
+
+ def download_model(self, model_config: ModelConfig) -> None:
+ pass
+
+ def load_model(
+ self,
+ *,
+ model_config: ModelConfig,
+ device_config: DeviceConfig,
+ lora_config: Optional[LoRAConfig],
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ cache_config: CacheConfig,
+ ) -> nn.Module:
+ with set_default_torch_dtype(model_config.dtype):
+ with torch.device(device_config.device):
+ model = _initialize_model(model_config, self.load_config, lora_config, cache_config, scheduler_config)
+ # NOTE(woosuk): For accurate performance evaluation, we assign
+ # random values to the weights.
+ # initialize_dummy_weights(model)
+ return model.eval()
+
+
+class MegatronLoader(BaseModelLoader):
+ """Model loader that can load the model weights from partitioned megatron model."""
+
+ def __init__(self, load_config: LoadConfig):
+ super().__init__(load_config)
+ if load_config.model_loader_extra_config:
+ raise ValueError(f"Model loader extra config is not supported for "
+ f"load format {load_config.load_format}")
+
+ def download_model(self, model_config: ModelConfig) -> None:
+ pass # Nothing to download
+
+ def _get_weights_iterator(self, actor_model: Union[PreTrainedModel, Dict]):
+ # NOTE(shengguangming) Load the weights from the actor model
+ pass
+ # if isinstance(actor_model, nn.Module):
+ # load_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)), vllm_model=model)
+ # else:
+ # load_weights(actor_weights=actor_model, vllm_model=model)
+ # return actor_model
+
+ def load_model(
+ self,
+ actor_model: Union[PreTrainedModel, Dict],
+ model_config: ModelConfig,
+ device_config: DeviceConfig,
+ lora_config: Optional[LoRAConfig],
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ cache_config: CacheConfig,
+ ) -> nn.Module:
+ with set_default_torch_dtype(model_config.dtype):
+ with torch.device(device_config.device):
+ model = _initialize_model(model_config, self.load_config, lora_config, cache_config, scheduler_config)
+
+ # TODO(sgm): This is a hack, we need to register the load_weight() func for each model in vllm
+ if isinstance(actor_model, nn.Module):
+ load_megatron_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)),
+ vllm_model=model)
+ else:
+ load_megatron_weights(actor_weights=actor_model, vllm_model=model)
+
+ for _, module in model.named_modules():
+ quant_method = getattr(module, "quant_method", None)
+ if quant_method is not None:
+ quant_method.process_weights_after_loading(module)
+ # FIXME: Remove this after Mixtral is updated
+ # to use quant_method.
+ if hasattr(module, "process_weights_after_loading"):
+ module.process_weights_after_loading()
+ # NOTE(sgm): some weights already point to GPU, but we still need this.
+ model = model.cuda() # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage
+ return model.eval()
+
+
+class HFLoader(BaseModelLoader):
+ """Model loader that can load the model weights from model's full params."""
+
+ def __init__(self, load_config: LoadConfig):
+ super().__init__(load_config)
+ if load_config.model_loader_extra_config:
+ raise ValueError(f"Model loader extra config is not supported for "
+ f"load format {load_config.load_format}")
+
+ def download_model(self, model_config: ModelConfig) -> None:
+ pass # Nothing to download
+
+ def _get_weights_iterator(self, actor_model: Union[PreTrainedModel, Dict]):
+ if isinstance(actor_model, Dict):
+ return actor_model.items()
+ elif isinstance(actor_model, nn.Module):
+ return dict(actor_model.named_parameters()).items()
+ else:
+ raise ValueError(f"actor model should be Dict or nn.Module, but get {type(actor_model)}")
+
+ def load_model(
+ self,
+ actor_model: Union[PreTrainedModel, Dict],
+ model_config: ModelConfig,
+ device_config: DeviceConfig,
+ lora_config: Optional[LoRAConfig],
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ cache_config: CacheConfig,
+ ) -> nn.Module:
+ with set_default_torch_dtype(model_config.dtype):
+ # with torch.device(device_config.device):
+ # NOTE(sgm): init the model in cpu
+ model = _initialize_model(model_config, self.load_config, lora_config, cache_config, scheduler_config)
+ model.load_weights(self._get_weights_iterator(actor_model))
+ for _, module in model.named_modules():
+ quant_method = getattr(module, "quant_method", None)
+ if quant_method is not None:
+ quant_method.process_weights_after_loading(module)
+ # FIXME: Remove this after Mixtral is updated
+ # to use quant_method.
+ if hasattr(module, "process_weights_after_loading"):
+ module.process_weights_after_loading()
+ # NOTE(sgm): some weights already point to GPU, but we still need this.
+ model = model.cuda() # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage
+ return model.eval()
+
+
+class DTensorLoader(BaseModelLoader):
+ """Model loader that can load the model weights from partitioned megatron model."""
+
+ def __init__(self, load_config: LoadConfig):
+ super().__init__(load_config)
+ if load_config.model_loader_extra_config:
+ raise ValueError(f"Model loader extra config is not supported for "
+ f"load format {load_config.load_format}")
+
+ def download_model(self, model_config: ModelConfig) -> None:
+ pass # Nothing to download
+
+ def _get_weights_iterator(self, actor_model: Union[PreTrainedModel, Dict]):
+ # NOTE(shengguangming) Load the weights from the actor model
+ pass
+ # if isinstance(actor_model, nn.Module):
+ # load_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)), vllm_model=model)
+ # else:
+ # load_weights(actor_weights=actor_model, vllm_model=model)
+ # return actor_model
+
+ def load_model(
+ self,
+ actor_model: Union[PreTrainedModel, Dict],
+ model_config: ModelConfig,
+ device_config: DeviceConfig,
+ lora_config: Optional[LoRAConfig],
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ cache_config: CacheConfig,
+ ) -> nn.Module:
+ with set_default_torch_dtype(model_config.dtype):
+ with torch.device(device_config.device):
+ model = _initialize_model(model_config, self.load_config, lora_config, cache_config, scheduler_config)
+
+ # TODO(sgm): This is a hack, we need to register the load_weight() func for each model in vllm
+ if isinstance(actor_model, nn.Module):
+ load_dtensor_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)),
+ vllm_model=model)
+ else:
+ load_dtensor_weights(actor_weights=actor_model, vllm_model=model)
+
+ for _, module in model.named_modules():
+ quant_method = getattr(module, "quant_method", None)
+ if quant_method is not None:
+ quant_method.process_weights_after_loading(module)
+ # FIXME: Remove this after Mixtral is updated
+ # to use quant_method.
+ if hasattr(module, "process_weights_after_loading"):
+ module.process_weights_after_loading()
+ # NOTE(sgm): some weights already point to GPU, but we still need this.
+ model = model.cuda() # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage
+ return model.eval()
+
+
+# FIXME(sgm): hack the _get_logits function in vllm v0.4.2.
+# Since vllm uses Ray there, the _get_logits result only needs to be returned to the
+# driver node, so a plain gather is enough. However, we use SPMD instead of a central
+# scheduler, so all_gather is required (aligned with v0.2.6).
+def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor,
+ embedding_bias: Optional[torch.Tensor]) -> torch.Tensor:
+ # Get the logits for the next tokens.
+ logits = torch.matmul(hidden_states, embedding.t())
+ if embedding_bias is not None:
+ logits += embedding_bias
+ logits = tensor_model_parallel_all_gather(logits)
+ # Remove paddings in vocab (if any).
+ if logits is not None:
+ logits = logits[:, :self.org_vocab_size]
+ return logits
+
+
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+
+
+def logitsprocessor_init(
+ self,
+ vocab_size: int,
+ org_vocab_size: Optional[int] = None,
+ scale: float = 1.0,
+ logits_as_input: bool = False,
+ soft_cap: Optional[float] = None,
+) -> None:
+ """
+ Args:
+ scale: A scaling factor to apply to the logits.
+ """
+ super(LogitsProcessor, self).__init__()
+ self.scale = scale
+ self.vocab_size = vocab_size
+ # Whether the input is logits (default is hidden states).
+ self.logits_as_input = logits_as_input
+ # original vocabulary size (without LoRA).
+ self.org_vocab_size = org_vocab_size or vocab_size
+ # Soft cap the logits. Used in Gemma 2.
+ self.soft_cap = soft_cap
+ # Whether to use gather or all-gather to gather the logits.
+ self.use_gather = False
+
+
+LogitsProcessor.__init__ = logitsprocessor_init # use all_gather
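+
+# With the patch above applied, every LogitsProcessor built afterwards has
+# `use_gather = False`, so vllm falls back to tensor_model_parallel_all_gather
+# when collecting logits, which is what the SPMD rollout needs. For example
+# (illustrative only):
+#
+#   lp = LogitsProcessor(vocab_size=32000)
+#   assert lp.use_gather is False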
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/model_runner.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/model_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0cceffb52fd29ae02466b3eec51faaf0bda2bfb
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/model_runner.py
@@ -0,0 +1,182 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/worker/model_runner.py
+
+import warnings
+from enum import IntEnum
+from typing import Dict, Optional, Union
+
+import torch
+import torch.nn as nn
+import vllm.envs as envs
+from vllm.compilation.levels import CompilationLevel
+from vllm.config import (
+ CacheConfig,
+ DeviceConfig,
+ LoadConfig,
+ LoRAConfig,
+ ModelConfig,
+ ObservabilityConfig,
+ ParallelConfig,
+ PromptAdapterConfig,
+ SchedulerConfig,
+)
+from vllm.inputs import INPUT_REGISTRY, InputRegistry
+from vllm.logger import init_logger
+from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
+from vllm.model_executor.models.interfaces import supports_lora, supports_multimodal
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
+from vllm.prompt_adapter.worker_manager import LRUCacheWorkerPromptAdapterManager
+from vllm.utils import DeviceMemoryProfiler, is_hip, supports_dynamo
+from vllm.worker.model_runner import ModelRunner
+
+from .config import LoadConfig, ModelConfig
+from .model_loader import get_model
+
+logger = init_logger(__name__)
+
+
+# How batches are constructed.
+class BatchType(IntEnum):
+ # Every batch is prefill.
+ PREFILL = 0
+ # Every batch is decode.
+ DECODE = 1
+ # Batch is a mixture of prefill and decode.
+ MIXED = 2
+
+
+class ModelRunner(ModelRunner):
+
+ def __init__(
+ self,
+ model: Union[nn.Module, Dict], # [verl] model itself or its parameter dict
+ model_config: ModelConfig,
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ device_config: DeviceConfig,
+ cache_config: CacheConfig,
+ load_config: LoadConfig,
+ lora_config: Optional[LoRAConfig],
+ kv_cache_dtype: Optional[str] = "auto",
+ is_driver_worker: bool = False,
+ prompt_adapter_config: Optional[PromptAdapterConfig] = None,
+ return_hidden_states: bool = False,
+ observability_config: Optional[ObservabilityConfig] = None,
+ input_registry: InputRegistry = INPUT_REGISTRY,
+ mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
+ ):
+
+ super().__init__(
+ model_config,
+ parallel_config,
+ scheduler_config,
+ device_config,
+ cache_config,
+ load_config,
+ lora_config,
+ kv_cache_dtype,
+ is_driver_worker=True, # a hack
+ prompt_adapter_config=prompt_adapter_config,
+ return_hidden_states=return_hidden_states,
+ observability_config=observability_config,
+ input_registry=input_registry,
+ mm_registry=mm_registry,
+ )
+
+ # NOTE(sgm): add for verl
+ self.model = model # this will be replaced by get_model()
+
+ def load_model(self) -> None:
+ logger.info("Starting to load model %s...", self.model_config.model)
+ with DeviceMemoryProfiler() as m:
+ self.model = get_model(
+ self.model,
+ model_config=self.model_config,
+ device_config=self.device_config,
+ load_config=self.load_config,
+ lora_config=self.lora_config,
+ parallel_config=self.parallel_config,
+ scheduler_config=self.scheduler_config,
+ cache_config=self.cache_config,
+ )
+
+ self.model_memory_usage = m.consumed_memory
+ logger.info("Loading model weights took %.4f GB", self.model_memory_usage / float(2**30))
+
+ if self.lora_config:
+ assert supports_lora(self.model), f"{self.model.__class__.__name__} does not support LoRA yet."
+
+ if supports_multimodal(self.model):
+ logger.warning("Regarding multimodal models, vLLM currently "
+ "only supports adding LoRA to language model.")
+ # It's necessary to distinguish between the max_position_embeddings
+ # of VLMs and LLMs.
+ if hasattr(self.model.config, "max_position_embeddings"):
+ max_pos_embeddings = self.model.config.max_position_embeddings
+ else:
+ max_pos_embeddings = self.model.config.text_config.max_position_embeddings
+
+ self.lora_manager = LRUCacheWorkerLoRAManager(
+ self.scheduler_config.max_num_seqs,
+ self.scheduler_config.max_num_batched_tokens,
+ self.vocab_size,
+ self.lora_config,
+ self.device,
+ self.model.embedding_modules,
+ self.model.embedding_padding_modules,
+ max_position_embeddings=max_pos_embeddings,
+ )
+ self.model = self.lora_manager.create_lora_manager(self.model)
+
+ if self.prompt_adapter_config:
+ self.prompt_adapter_manager = LRUCacheWorkerPromptAdapterManager(
+ self.scheduler_config.max_num_seqs,
+ self.scheduler_config.max_num_batched_tokens,
+ self.device,
+ self.prompt_adapter_config,
+ )
+ self.model = self.prompt_adapter_manager.create_prompt_adapter_manager(self.model)
+
+ if self.kv_cache_dtype == "fp8" and is_hip():
+ # Currently only ROCm accepts kv-cache scaling factors
+ # via quantization_param_path and this will be deprecated
+ # in the future.
+ if self.model_config.quantization_param_path is not None:
+ if callable(getattr(self.model, "load_kv_cache_scales", None)):
+ warnings.warn(
+ "Loading kv cache scaling factor from JSON is "
+ "deprecated and will be removed. Please include "
+ "kv cache scaling factors in the model checkpoint.",
+ FutureWarning,
+ stacklevel=2,
+ )
+ self.model.load_kv_cache_scales(self.model_config.quantization_param_path)
+ logger.info("Loaded KV cache scaling factors from %s", self.model_config.quantization_param_path)
+ else:
+ raise RuntimeError(
+ "Using FP8 KV cache and scaling factors provided but "
+ "model %s does not support loading scaling factors.",
+ self.model.__class__,
+ )
+ else:
+ logger.warning("Using FP8 KV cache but no scaling factors "
+ "provided. Defaulting to scaling factors of 1.0. "
+ "This may lead to less accurate results!")
+
+ if envs.VLLM_TORCH_COMPILE_LEVEL == CompilationLevel.DYNAMO_AS_IS and supports_dynamo():
+ from vllm.plugins import get_torch_compile_backend
+
+ backend = get_torch_compile_backend() or "eager"
+ self.model = torch.compile(self.model, fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, backend=backend)
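+
+
+# NOTE: unlike upstream vllm, this runner receives the actor's module (or its
+# parameter dict) in its constructor, and load_model() hands it to verl's
+# get_model(), so rollout weights can be refreshed in place rather than
+# re-read from disk.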
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/parallel_state.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/parallel_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..0150c1c678e43dc5a6cb3f4426b5854ab45d8e4a
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/parallel_state.py
@@ -0,0 +1,312 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Adapted from
+# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+"""Model and data parallel groups."""
+import os
+from typing import Optional
+
+import torch
+import torch.distributed
+import vllm.distributed.parallel_state as ps
+from vllm.distributed.parallel_state import (
+ get_pp_group,
+ get_world_group,
+ init_distributed_environment,
+ init_model_parallel_group,
+)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+"""
+This version is strongly tied to Megatron to implement HybridEngine and weight sharing between vllm and Megatron.
+- We assume the Megatron tp+dp+pp world is already established before these functions are called.
+
+"""
+
+# Device mesh for using DTensor
+_DEVICE_MESH = None
+
+# Tensor model parallel group that the current rank belongs to.
+_TP = None
+# Pipeline model parallel group that the current rank belongs to.
+_PP = None
+
+
+# This method is for initializing the ParallelGroup when using HybridEngine
+def initialize_parallel_state(
+ distributed_init_method: str = "env://",
+ backend: str = "nccl",
+ tensor_model_parallel_size: int = 1,
+ num_tp_per_train_tp: int = 1,
+ pipeline_model_parallel_size: int = 1,
+):
+ # torch.distributed.all_reduce does not free the input tensor until
+ # the synchronization point. This causes the memory usage to grow
+ # as the number of all_reduce calls increases. This env var disables
+ # this behavior.
+ # Related issue:
+ # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
+ os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+    # NOTE(sgm): Modified for verl; env vars will be set by TORCHRUN.
+ rank = int(os.getenv("RANK", "-1"))
+ local_rank = int(os.getenv("LOCAL_RANK", "0"))
+
+ # Use the world_size set by TORCHRUN
+ world_size = int(os.getenv("WORLD_SIZE", "-1"))
+ assert world_size != -1, "The world_size is set to -1, not initialized by TORCHRUN"
+ init_distributed_environment(world_size, rank, distributed_init_method, local_rank, backend)
+ if torch.distributed.get_world_size() > 1:
+        # NOTE: build a separate inference group with infer tp & micro dp
+ initialize_model_parallel_for_vllm(
+ tensor_model_parallel_size=tensor_model_parallel_size,
+ num_tensor_model_parallel_groups_per_train_tp=num_tp_per_train_tp,
+ )
+ else:
+ initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend)
+
+
+def ensure_model_parallel_initialized(
+ tensor_model_parallel_size: int,
+ pipeline_model_parallel_size: int = 1,
+ backend: Optional[str] = None,
+) -> None:
+ """Helper to initialize model parallel groups if they are not initialized,
+ or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
+ values if the model parallel groups are initialized.
+ """
+ # get the backend of _DEVICE_WORLD_GROUP
+ backend = backend or torch.distributed.get_backend(get_world_group().device_group)
+ if not model_parallel_is_initialized():
+ initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend)
+ return
+
+ assert get_tensor_model_parallel_world_size() == tensor_model_parallel_size, (
+ "tensor parallel group already initialized, but of unexpected size: "
+ f"{get_tensor_model_parallel_world_size()=} vs. "
+ f"{tensor_model_parallel_size=}")
+ pp_world_size = get_pp_group().world_size
+ assert pp_world_size == pipeline_model_parallel_size, (
+ "pipeline parallel group already initialized, but of unexpected size: "
+ f"{pp_world_size=} vs. "
+ f"{pipeline_model_parallel_size=}")
+
+
+# TODO(sgm): deviates from v0.5.4, no pp support for now
+def model_parallel_is_initialized():
+ """Check if tensor and pipeline parallel groups are initialized."""
+ return ps._TP is not None
+ # and _PIPELINE_MODEL_PARALLEL_GROUP is not None)
+
+
+def initialize_model_parallel_for_vllm(
+ tensor_model_parallel_size: int,
+ num_tensor_model_parallel_groups_per_train_tp: int = 1,
+ pipeline_model_parallel_size: int = 1,
+) -> None:
+    # Get world size and rank. Ensure some consistency.
+ assert torch.distributed.is_initialized()
+
+ assert isinstance(tensor_model_parallel_size, int)
+
+ # assert num_tensor_model_parallel_groups_per_train_tp == 1 and not different_tp_group
+ # assert num_tensor_model_parallel_groups_per_train_tp > 1 and different_tp_group
+
+ # Build the tensor model-parallel groups.
+ assert ps._TP is None, "tensor model parallel group is already initialized"
+
+ global _TP
+
+ world_size: int = torch.distributed.get_world_size()
+
+ rank = torch.distributed.get_rank()
+
+ backend = torch.distributed.get_backend()
+
+ num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
+
+ if num_tensor_model_parallel_groups_per_train_tp == 1:
+ # if tensor_model_parallel_size == train_tensor_parallel_size:
+ # using the same tp group as Megatron/vllm
+ assert _TP is None, "tensor model parallel group is already initialized"
+ group_ranks = []
+ for i in range(num_tensor_model_parallel_groups):
+ ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)
+ group_ranks.append(ranks)
+ _TP = init_model_parallel_group(
+ group_ranks=group_ranks,
+ local_rank=get_world_group().local_rank,
+ backend=backend,
+            use_custom_allreduce=False,  # TODO: check why True does not work in the Ray trainer
+ use_message_queue_broadcaster=True,
+ )
+ ps._TP = _TP
+        # _MICRO_DATA_PARALLEL_GROUP is moved to the hybrid engine
+ else:
+ # initialize a micro_dp group and a tp group
+ # assume training tp=4, infer tp=2, then, weight is partitioned as
+ # [1], [2], [3], [4] for training and [1,2], [1,2], [3,4], [3,4] for inference
+
+ # Build the inference tp groups
+ # train_tp = train_tensor_parallel_size
+ train_tp = num_tensor_model_parallel_groups_per_train_tp * tensor_model_parallel_size
+ # num_tensor_model_parallel_groups_per_train_tp = train_tp // tensor_model_parallel_size
+ assert _TP is None, "tensor model parallel group is already initialized"
+ group_ranks = []
+ for i in range(num_tensor_model_parallel_groups // num_tensor_model_parallel_groups_per_train_tp):
+ start = train_tp * i
+ end = train_tp * (i + 1)
+ for j in range(num_tensor_model_parallel_groups_per_train_tp):
+                ranks = [r + j for r in range(start, end, num_tensor_model_parallel_groups_per_train_tp)]
+ group_ranks.append(ranks)
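+        # Worked example (illustrative, not from the original source): with
+        # world_size=8, tensor_model_parallel_size=2 and
+        # num_tensor_model_parallel_groups_per_train_tp=2, train_tp=4 and the
+        # loop above yields group_ranks == [[0, 2], [1, 3], [4, 6], [5, 7]],
+        # i.e. each inference TP group strides across one training TP group.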
+ _TP = init_model_parallel_group(
+ group_ranks=group_ranks,
+ local_rank=get_world_group().local_rank,
+ backend=backend,
+            use_custom_allreduce=False,  # TODO: check why True does not work in the Ray trainer
+ use_message_queue_broadcaster=True,
+ )
+ ps._TP = _TP
+
+ # Build the pipeline model-parallel groups.
+ # global _PIPELINE_MODEL_PARALLEL_GROUP
+ # global _PIPELINE_GLOBAL_RANKS
+ # assert ps._PIPELINE_MODEL_PARALLEL_GROUP is None, ("pipeline model parallel group is already initialized")
+
+ # ps._PIPELINE_MODEL_PARALLEL_GROUP = mpu.get_pipeline_model_parallel_group()
+ # ps._PIPELINE_GLOBAL_RANKS = mpu.get_pipeline_model_parallel_ranks()
+
+ # TODO: init using device mesh (not support hybrid engine now)
+ # Build the pipeline model-parallel groups.
+ num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
+ global _PP
+ assert _PP is None, "pipeline model parallel group is already initialized"
+ group_ranks = []
+ for i in range(num_pipeline_model_parallel_groups):
+ ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
+ group_ranks.append(ranks)
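+    # e.g. (illustrative): world_size=8 with pipeline_model_parallel_size=2
+    # gives four PP groups: [0, 4], [1, 5], [2, 6], [3, 7].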
+ # pipeline parallel does not need custom allreduce
+ _PP = init_model_parallel_group(group_ranks, get_world_group().local_rank, backend, use_custom_allreduce=False)
+ ps._PP = _PP # for verl
+
+
+def initialize_model_parallel(
+ tensor_model_parallel_size: int = 1,
+ pipeline_model_parallel_size: int = 1,
+ backend: Optional[str] = None,
+) -> None:
+ """
+    NOTE: This method is adapted from the open-source version, dropping the
+    assertion that world_size == tp * pp.
+
+ Initialize model parallel groups.
+
+ Arguments:
+ tensor_model_parallel_size: number of GPUs used for tensor model
+ parallelism.
+ pipeline_model_parallel_size: number of GPUs used for pipeline model
+ parallelism.
+
+ Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
+ use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
+ the model pipeline. The present function will
+ create 4 tensor model-parallel groups and 2 pipeline model-parallel groups:
+ 4 tensor model-parallel groups:
+ [g0, g1], [g2, g3], [g4, g5], [g6, g7]
+ 2 pipeline model-parallel groups:
+ [g0, g2, g4, g6], [g1, g3, g5, g7]
+ Note that for efficiency, the caller should make sure adjacent ranks
+ are on the same DGX box. For example if we are using 2 DGX-1 boxes
+ with a total of 16 GPUs, rank 0 to 7 belong to the first box and
+ ranks 8 to 15 belong to the second box.
+ """
+    # Get world size and rank. Ensure some consistency.
+ assert torch.distributed.is_initialized()
+ world_size: int = torch.distributed.get_world_size()
+ backend = backend or torch.distributed.get_backend(ps.get_world_group().device_group)
+
+ # NOTE(sgm) we don't assert world_size == tp * pp
+ # DP is not managed by vllm but by the VeRL WorkerGroup
+ # if (world_size !=
+ # tensor_model_parallel_size * pipeline_model_parallel_size):
+ # raise RuntimeError(
+ # f"world_size ({world_size}) is not equal to "
+ # f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
+ # f"pipeline_model_parallel_size ({pipeline_model_parallel_size})")
+
+ num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size
+ rank = torch.distributed.get_rank()
+ global _TP
+ assert _TP is None, "tensor model parallel group is already initialized"
+ group_ranks = []
+ for i in range(num_tensor_model_parallel_groups):
+ ranks = list(range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size))
+ group_ranks.append(ranks)
+
+ # message queue broadcaster is only used in tensor model parallel group
+ _TP = init_model_parallel_group(
+ group_ranks,
+ get_world_group().local_rank,
+ backend,
+        use_custom_allreduce=False,  # TODO: check why True does not work in the Ray trainer
+ use_message_queue_broadcaster=True,
+ )
+ ps._TP = _TP
+
+ # TODO: init using device mesh (not support hybrid engine now)
+ # Build the pipeline model-parallel groups.
+ num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
+ global _PP
+ assert _PP is None, "pipeline model parallel group is already initialized"
+ group_ranks = []
+ for i in range(num_pipeline_model_parallel_groups):
+ ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
+ group_ranks.append(ranks)
+ # pipeline parallel does not need custom allreduce
+ _PP = init_model_parallel_group(group_ranks, get_world_group().local_rank, backend, use_custom_allreduce=False)
+ ps._PP = _PP # for verl
+
+
+"""
+Device mesh utilities
+"""
+
+
+def get_device_mesh():
+ assert _DEVICE_MESH is not None, "device mesh is not initialized"
+ return _DEVICE_MESH
+
+
+"""
+Tensor model parallel utilities
+"""
+
+
+def get_tensor_model_parallel_group():
+ """Get the tensor model parallel group the caller rank belongs to."""
+ assert _TP is not None, "tensor model parallel group is not initialized"
+ return _TP.device_group
+
+
+def get_tensor_model_parallel_world_size():
+ """Return world size for the tensor model parallel group."""
+ return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())
+
+
+def get_tensor_model_parallel_rank():
+ """Return my rank for the tensor model parallel group."""
+ return torch.distributed.get_rank(group=get_tensor_model_parallel_group())
+
+
+def get_tensor_model_parallel_src_rank():
+ """Calculate the global rank corresponding to the first local rank
+ in the tensor model parallel group."""
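+    # e.g. (illustrative): with a TP world size of 4, global ranks 4..7 all
+    # map to src rank 4, since (rank // 4) * 4 == 4 for each of them.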
+ global_rank = torch.distributed.get_rank()
+ local_world_size = get_tensor_model_parallel_world_size()
+ return (global_rank // local_world_size) * local_world_size
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/spmd_gpu_executor.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/spmd_gpu_executor.py
new file mode 100644
index 0000000000000000000000000000000000000000..229a424c840226e2f6c148418d7c69a97807afa1
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/spmd_gpu_executor.py
@@ -0,0 +1,256 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/executor/gpu_executor.py
+
+import os
+import socket
+from typing import Dict, List, Optional, Set, Tuple
+
+import torch
+from vllm.config import (
+ CacheConfig,
+ DeviceConfig,
+ LoRAConfig,
+ ObservabilityConfig,
+ ParallelConfig,
+ PromptAdapterConfig,
+ SchedulerConfig,
+ SpeculativeConfig,
+)
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.sequence import ExecuteModelRequest
+
+from .config import LoadConfig, ModelConfig
+
+logger = init_logger(__name__)
+
+
+class SPMDGPUExecutor(ExecutorBase):
+ """SPMD-based multi-GPU executor implementations."""
+
+ def __init__(
+ self,
+ model, # pytorch model itself or its parameter dict
+ model_config: ModelConfig,
+ cache_config: CacheConfig,
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ device_config: DeviceConfig,
+ load_config: LoadConfig,
+ lora_config: Optional[LoRAConfig],
+ speculative_config: Optional[SpeculativeConfig],
+ prompt_adapter_config: Optional[PromptAdapterConfig],
+ observability_config: Optional[ObservabilityConfig],
+ ) -> None:
+ self.model_config = model_config
+ self.cache_config = cache_config
+ self.lora_config = lora_config
+ self.load_config = load_config
+ self.parallel_config = parallel_config
+ self.scheduler_config = scheduler_config
+ self.device_config = device_config
+ self.speculative_config = speculative_config
+ self.prompt_adapter_config = prompt_adapter_config
+ self.observability_config = observability_config
+
+ distributed_init_method = initialize_cluster(parallel_config)
+ self._init_executor(model, distributed_init_method)
+
+ # TODO(sgm): verl not support speculative decode now
+ def _init_executor(self, model, distributed_init_method) -> None:
+ assert not self.speculative_config, "Speculative decoding not yet supported for multi-GPU backend."
+
+ # Create the parallel worker for each GPU.
+ self._init_workers_sp(model, distributed_init_method)
+
+ def _init_workers_sp(self, model, distributed_init_method: str):
+ # Lazy import the Worker to avoid importing torch.cuda/xformers
+ # before CUDA_VISIBLE_DEVICES is set in the Worker
+ from .worker import Worker # pylint: disable=import-outside-toplevel
+
+ rank = int(os.getenv("RANK"))
+ local_rank = int(os.getenv("LOCAL_RANK"))
+ print(f"local rank {local_rank}")
+
+ # see https://github.com/NVIDIA/nccl/issues/1234
+ os.environ["NCCL_CUMEM_ENABLE"] = "0"
+
+ self.worker = Worker(
+ model,
+ self.model_config,
+ self.parallel_config,
+ self.scheduler_config,
+ self.device_config,
+ self.cache_config,
+ self.load_config,
+ local_rank,
+ rank,
+ distributed_init_method,
+ lora_config=self.lora_config,
+ speculative_config=None,
+            prompt_adapter_config=self.prompt_adapter_config,
+ is_driver_worker=True,
+ model_runner_cls=None, # use the default one
+ )
+
+        # NOTE(shengguangming): torch.distributed.init_process_group will be called inside init_device()
+ self.worker.init_device()
+ self.worker.load_model()
+
+ def determine_num_available_blocks(self) -> Tuple[int, int]:
+ """Determine the number of available KV blocks.
+
+        This invokes `determine_num_available_blocks` on the underlying worker;
+        under SPMD each process runs its own worker, and the worker all-reduces
+        the result so the selected cache sizes are compatible with all ranks.
+
+ Returns:
+ - tuple[num_gpu_blocks, num_cpu_blocks]
+ """
+ # Get the maximum number of blocks that can be allocated on GPU and CPU.
+ num_blocks = self.worker.determine_num_available_blocks()
+
+        # NOTE(shengguangming): we don't use a shared centralized controller; each process
+        # has its own scheduler
+ num_gpu_blocks = num_blocks[0]
+ num_cpu_blocks = num_blocks[1]
+
+ return num_gpu_blocks, num_cpu_blocks
+
+ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
+ """Initialize the KV cache in all workers."""
+
+ # NOTE: We log here to avoid multiple logs when number of workers is
+ # greater than one. We could log in the engine, but not all executors
+ # have GPUs.
+ logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, num_cpu_blocks)
+
+ self.cache_config.num_gpu_blocks = num_gpu_blocks
+ self.cache_config.num_cpu_blocks = num_cpu_blocks
+
+ if torch.distributed.get_rank() == 0:
+ print(
+ f"before init cache memory allocated: {torch.cuda.memory_allocated() / 1e9}GB, reserved: {torch.cuda.memory_reserved() / 1e9}GB"
+ )
+ self.worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks)
+ if torch.distributed.get_rank() == 0:
+ print(
+ f"after init cache memory allocated: {torch.cuda.memory_allocated() / 1e9}GB, reserved: {torch.cuda.memory_reserved() / 1e9}GB"
+ )
+
+ # NOTE(sgm): This will not profile & capture the model(CUDAGraph) when rebuilding KVCache
+ def init_cache_engine(self) -> None:
+ self.worker._init_cache_engine()
+
+ def free_cache_engine(self) -> None:
+ self.worker.free_cache_engine()
+
+ def execute_model(self, execute_model_req) -> List[SamplerOutput]:
+ all_outputs = self.worker.execute_model(execute_model_req=execute_model_req)
+
+ # NOTE(sgm):
+        # Each GPU in vLLM under verl has its own spmd_gpu_executor, so all GPUs return the outputs.
+        # In vLLM with Ray, only the driver worker returns the sampling results.
+ return all_outputs
+
+ def add_lora(self, lora_request: LoRARequest) -> bool:
+ assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
+ return self.worker.add_lora(lora_request=lora_request)
+
+ def remove_lora(self, lora_id: int) -> bool:
+ assert lora_id > 0, "lora_id must be greater than 0."
+ return self.worker.remove_lora(lora_id=lora_id)
+
+ def list_loras(self) -> Set[int]:
+ return self.worker.list_loras()
+
+ def check_health(self) -> None:
+ # SPMDExecutor will always be healthy as long as
+ # it's running.
+ return
+
+ # NOTE(sgm) add for verl to pass the abstract class test, not used
+ from vllm.prompt_adapter.request import PromptAdapterRequest
+
+ def add_prompt_adapter(self, prompt_adapter_request: PromptAdapterRequest) -> bool:
+ assert prompt_adapter_request.prompt_adapter_id > 0, "prompt_adapter_id must be greater than 0."
+ return self.worker.add_prompt_adapter(prompt_adapter_request)
+
+ def list_prompt_adapters(self) -> Set[int]:
+ return self.worker.list_prompt_adapters()
+
+ def pin_lora(self, lora_id: int) -> bool:
+ assert lora_id > 0, "lora_id must be greater than 0."
+ return self.worker.pin_lora(lora_id)
+
+ def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+ assert prompt_adapter_id > 0, "prompt_adapter_id must be greater than 0."
+ return self.worker.pin_prompt_adapter(prompt_adapter_id)
+
+ def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+ assert prompt_adapter_id > 0, "prompt_adapter_id must be greater than 0."
+ return self.worker.remove_prompt_adapter(prompt_adapter_id)
+
+ # NOTE(sgm): add for verl
+ def offload_model_weights(self) -> None:
+ self.worker.offload_model_weights()
+
+ def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
+ self.worker.sync_model_weights(actor_weights=actor_weights, load_format=load_format)
+
+
+def initialize_cluster(
+ parallel_config: ParallelConfig,
+ engine_use_ray: bool = False,
+ ray_address: Optional[str] = None,
+) -> str:
+    """Initialize the distributed cluster.
+
+ Args:
+ parallel_config: The configurations for parallel execution.
+
+ Returns:
+ The `distributed_init_method` is the address for initializing the
+ distributed backend.
+ """
+
+    # Initialize cluster locally.
+    port = get_open_port()  # only needed for the tcp:// init method below
+    # We need to set up the distributed init method to make sure
+    # the distributed megatron code (e.g., get world size) works correctly.
+    # distributed_init_method = f"tcp://localhost:{port}"
+    distributed_init_method = "env://"
+ return distributed_init_method
+
+
+def get_open_port():
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+ s.bind(("", 0))
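+        # NOTE: the socket closes when the `with` block exits, so another
+        # process could in principle grab the port before it is used.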
+ return s.getsockname()[1]
+
+
+# TODO(sgm): not implemented async executor yet
+class SPMDGPUExecutorAsync(SPMDGPUExecutor, ExecutorAsyncBase):
+
+ async def execute_model_async(self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+ """Executes one model step on the given sequences."""
+ raise NotImplementedError
+
+ async def check_health_async(self) -> None:
+ """Checks if the executor is healthy. If not, it should raise an
+ exception."""
+ self.check_health()
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/tokenizer.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0b4d0e27c84fc0358411d7bf29e0702aac929b9
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/tokenizer.py
@@ -0,0 +1,40 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
+
+from typing import Optional
+
+from transformers import PreTrainedTokenizer
+from vllm.transformers_utils.tokenizer_group import TokenizerGroup
+from vllm.utils import LRUCache
+
+
+class TokenizerGroup(TokenizerGroup):
+ """A group of tokenizers that can be used for LoRA adapters."""
+
+ def __init__(self, tokenizer: PreTrainedTokenizer, enable_lora: bool, max_num_seqs: int,
+ max_input_length: Optional[int]):
+ self.enable_lora = enable_lora
+ self.max_input_length = max_input_length
+ self.tokenizer = tokenizer
+ self.lora_tokenizers = LRUCache[PreTrainedTokenizer](capacity=max_num_seqs) if enable_lora else None
+
+ # FIXME(sgm): for simplicity, we assign the special token here
+ @property
+ def pad_token_id(self):
+ return self.tokenizer.pad_token_id
+
+ @property
+ def eos_token_id(self):
+ return self.tokenizer.eos_token_id
diff --git a/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/worker.py b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/worker.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb1a7ab80c7526177cc0e53963f0e2e85d683334
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/third_party/vllm/vllm_v_0_6_3/worker.py
@@ -0,0 +1,333 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2023 The vLLM team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/worker/worker.py
+"""A GPU worker class."""
+import gc
+import os
+from typing import Dict, List, Optional, Tuple, Type, Union
+
+import torch
+import torch.distributed
+import torch.nn as nn
+from vllm.config import (
+ CacheConfig,
+ DeviceConfig,
+ LoRAConfig,
+ ParallelConfig,
+ PromptAdapterConfig,
+ SchedulerConfig,
+ SpeculativeConfig,
+)
+
+# TODO(sgm): check why vllm has similar file in vllm.model_executor.parallel_utils.parallel_state
+from vllm.distributed import get_tensor_model_parallel_group, init_distributed_environment, set_custom_all_reduce
+from vllm.model_executor import set_random_seed
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.sequence import ExecuteModelRequest, IntermediateTensors
+from vllm.worker.cache_engine import CacheEngine
+from vllm.worker.embedding_model_runner import EmbeddingModelRunner
+from vllm.worker.model_runner import GPUModelRunnerBase
+from vllm.worker.model_runner_base import ModelRunnerInputBase
+from vllm.worker.worker import Worker, _check_if_gpu_supports_dtype
+from vllm.worker.worker_base import WorkerInput
+
+from .config import LoadConfig, LoadFormat, ModelConfig
+from .dtensor_weight_loaders import load_dtensor_weights
+from .hf_weight_loader import load_hf_weights
+from .megatron_weight_loaders import load_megatron_weights
+from .model_runner import ModelRunner
+from .parallel_state import ensure_model_parallel_initialized
+
+
+class Worker(Worker):
+ """A worker class that executes (a partition of) the model on a GPU.
+
+ Each worker is associated with a single GPU. The worker is responsible for
+ maintaining the KV cache and executing the model on the GPU. In case of
+ distributed inference, each worker is assigned a partition of the model.
+ """
+
+ def __init__(
+ self,
+ model: Union[nn.Module, Dict], # model itself or its parameter dict
+ model_config: ModelConfig,
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig,
+ device_config: DeviceConfig,
+ cache_config: CacheConfig,
+ load_config: LoadConfig,
+ local_rank: int,
+ rank: int,
+ distributed_init_method: str,
+ lora_config: Optional[LoRAConfig] = None,
+ speculative_config: Optional[SpeculativeConfig] = None,
+ prompt_adapter_config: Optional[PromptAdapterConfig] = None,
+ is_driver_worker: bool = False,
+ model_runner_cls: Optional[Type[GPUModelRunnerBase]] = None,
+ ) -> None:
+ # self.model = model # will be replaced in the init_model
+ self.model_config = model_config
+ self.parallel_config = parallel_config
+ self.parallel_config.rank = rank
+ self.scheduler_config = scheduler_config
+ self.device_config = device_config
+ self.cache_config = cache_config
+ self.local_rank = local_rank
+ self.rank = rank
+ self.distributed_init_method = distributed_init_method
+ self.lora_config = lora_config
+ self.load_config = load_config
+ self.prompt_adapter_config = prompt_adapter_config
+ self.is_driver_worker = is_driver_worker # TODO: we don't need driver
+ # if parallel_config and is_driver_worker:
+ # assert rank % parallel_config.tensor_parallel_size == 0, \
+ # "Driver worker should be rank 0 of tensor parallel group."
+ if self.model_config.trust_remote_code:
+ # note: lazy import to avoid importing torch before initializing
+ from vllm.utils import init_cached_hf_modules
+
+ init_cached_hf_modules()
+
+ # Return hidden states from target model if the draft model is an
+ # mlp_speculator
+ speculative_args = (
+ {} if speculative_config is None or (speculative_config.draft_model_config.model == model_config.model) or
+ (speculative_config.draft_model_config.hf_config.model_type not in ["medusa", "mlp_speculator"]) else {
+ "return_hidden_states": True
+ })
+
+ # TODO(sgm): set correct model runner class
+ ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner
+ if model_runner_cls is not None:
+ ModelRunnerClass = model_runner_cls
+ elif self.model_config.embedding_mode:
+ ModelRunnerClass = EmbeddingModelRunner
+ self.model_runner: GPUModelRunnerBase = ModelRunnerClass(
+ model, # [VERL]: add for verl
+ model_config,
+ parallel_config,
+ scheduler_config,
+ device_config,
+ cache_config,
+ load_config=load_config,
+ lora_config=self.lora_config,
+ kv_cache_dtype=self.cache_config.cache_dtype,
+ is_driver_worker=is_driver_worker,
+ prompt_adapter_config=prompt_adapter_config,
+ **speculative_args,
+ )
+
+ # Uninitialized cache engine. Will be initialized by
+ # initialize_cache.
+        self.cache_engine: Optional[List[CacheEngine]] = None
+ # Initialize gpu_cache as embedding models don't initialize kv_caches
+ self.gpu_cache: Optional[List[List[torch.Tensor]]] = None
+
+ # NOTE(sgm): [VERL] For offloading inference engine params
+ self.cpu_model = None
+
+ def init_device(self) -> None:
+ if self.device_config.device.type == "cuda":
+ # torch.distributed.all_reduce does not free the input tensor until
+ # the synchronization point. This causes the memory usage to grow
+ # as the number of all_reduce calls increases. This env var disables
+ # this behavior.
+ # Related issue:
+ # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
+ os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+            # NOTE(sgm): Modified for verl; env vars will be set by TORCHRUN.
+ self.rank = self.rank if self.rank is not None else int(os.getenv("RANK", "-1"))
+ local_rank = int(os.getenv("LOCAL_RANK", "0"))
+ self.device = torch.device(f"cuda:{local_rank}")
+ if self.rank < 0:
+ raise ValueError("Invalid or unspecified rank.")
+ torch.cuda.set_device(self.device)
+
+ # Use the world_size set by TORCHRUN
+ world_size = int(os.getenv("WORLD_SIZE", "-1"))
+ assert world_size != -1, "The world_size is set to -1, not initialized by TORCHRUN"
+ self.parallel_config.world_size = world_size
+
+ _check_if_gpu_supports_dtype(self.model_config.dtype)
+ torch.cuda.empty_cache()
+ self.init_gpu_memory = torch.cuda.mem_get_info()[0]
+ else:
+ raise RuntimeError(f"Not support device type: {self.device_config.device}")
+
+ # Initialize the distributed environment.
+ init_worker_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method,
+ self.local_rank)
+ # Set random seed.
+ set_random_seed(self.model_config.seed)
+ # self.model = get_model(actor_model=self.model, model_config=self.model_config)
+
+ @torch.inference_mode()
+ def determine_num_available_blocks(self) -> Tuple[int, int]:
+ """Profiles the peak memory usage of the model to determine how many
+ KV blocks may be allocated without OOMs.
+
+        The engine first profiles the existing memory usage.
+        Then, it calculates the maximum possible number of GPU and CPU blocks
+ that can be allocated with the remaining free memory.
+
+ .. tip::
+ You may limit the usage of GPU memory
+ by adjusting the `gpu_memory_utilization` parameter.
+ """
+ # Profile the memory usage of the model and get the maximum number of
+ # cache blocks that can be allocated with the remaining free memory.
+ torch.cuda.empty_cache()
+ # torch.cuda.reset_peak_memory_stats()
+
+ # Execute a forward pass with dummy inputs to profile the memory usage
+ # of the model.
+ self.model_runner.profile_run()
+
+ # Calculate the number of blocks that can be allocated with the
+ # profiled peak memory.
+ torch.cuda.synchronize()
+ free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+ peak_memory = total_gpu_memory - free_gpu_memory
+
+ assert peak_memory > 0, ("Error in memory profiling. This happens when the GPU memory was "
+ "not properly cleaned up before initializing the vLLM instance.")
+
+ cache_block_size = self.get_cache_block_size_bytes()
+
+ # NOTE(sgm) [VERL] use the remaining memory
+ num_gpu_blocks = int((free_gpu_memory * self.cache_config.gpu_memory_utilization) // cache_block_size)
+ # num_gpu_blocks = int((total_gpu_memory * self.cache_config.gpu_memory_utilization - peak_memory) // cache_block_size)
+
+ num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size)
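+        # Illustrative sizing (assumed numbers): with 40 GiB free, a utilization
+        # of 0.5 and a ~2 MiB cache block, this yields roughly 10k GPU blocks.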
+ num_gpu_blocks = max(num_gpu_blocks, 0)
+ num_cpu_blocks = max(num_cpu_blocks, 0)
+ if self.model_runner.lora_manager:
+ self.model_runner.remove_all_loras()
+
+        # NOTE(sgm): Add for [VERL], synchronize the number of blocks across all ranks
+ num_gpu_blocks = torch.tensor([num_gpu_blocks], device="cuda")
+ num_cpu_blocks = torch.tensor([num_cpu_blocks], device="cuda")
+
+ torch.distributed.all_reduce(num_gpu_blocks,
+ op=torch.distributed.ReduceOp.MIN,
+ group=get_tensor_model_parallel_group().device_group)
+ torch.distributed.all_reduce(num_cpu_blocks,
+ op=torch.distributed.ReduceOp.MIN,
+ group=get_tensor_model_parallel_group().device_group)
+ num_gpu_blocks = num_gpu_blocks.item()
+ num_cpu_blocks = num_cpu_blocks.item()
+ gc.collect()
+ torch.cuda.empty_cache()
+ return num_gpu_blocks, num_cpu_blocks
+
+ def _init_cache_engine(self):
+ if self.cache_engine is None and self.gpu_cache is None:
+ super()._init_cache_engine()
+
+ def free_cache_engine(self):
+ # ensure `enforce_eager=True`
+ self.cache_engine = None
+ self.gpu_cache = None
+
+ # NOTE(sgm): [VERL]: adapt from _execute_model_spmd()
+ def execute_model(self,
+ execute_model_req: ExecuteModelRequest,
+ intermediate_tensors: Optional[IntermediateTensors] = None) -> Optional[List[SamplerOutput]]:
+ """
+ Execute model in Single Program Multiple Data (SPMD) fashion.
+ All workers take the same request, prepare the input and
+ execute the model.
+ """
+        assert execute_model_req is not None, ("execute_model() requires each worker to take in an "
+                                               "ExecuteModelRequest")
+ worker_input: WorkerInput = self.prepare_worker_input(execute_model_req=execute_model_req)
+ model_input: ModelRunnerInputBase = self.model_runner.prepare_model_input(
+ execute_model_req.seq_group_metadata_list)
+
+ # verl.worker.workerbase.WorkerBase
+ # swap cache
+ super().execute_worker(worker_input)
+
+ # If there is no input, we don't need to execute the model.
+ if worker_input.num_seq_groups == 0:
+ return []
+
+ return self.model_runner.execute_model(
+ model_input,
+ self.kv_cache[worker_input.virtual_engine] if self.kv_cache is not None else None,
+ intermediate_tensors,
+ )
+
+ # assume the input is .state_dict()
+ def sync_model_weights(self, actor_weights: Dict, load_format: str):
+ if load_format in [LoadFormat.MEGATRON, LoadFormat.AUTO]:
+ load_megatron_weights(actor_weights, self.model_runner.model)
+ elif load_format == LoadFormat.HF:
+            # full model state dict without sharding
+ load_hf_weights(actor_weights, self.model_runner.model)
+ elif load_format == LoadFormat.DTENSOR:
+ load_dtensor_weights(actor_weights, self.model_runner.model)
+
+ def offload_model_weights(self) -> None:
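+        # NOTE: the CPU buffers below are created with empty_like (uninitialized),
+        # so this frees GPU memory without preserving the weights; they are
+        # presumably restored later via sync_model_weights().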
+        if self.cpu_model is None:
+ self.cpu_model = {}
+ for name, params in self.model_runner.model.named_parameters():
+ self.cpu_model[name] = torch.empty_like(params, device="cpu")
+ params.data = self.cpu_model[name]
+ else:
+ for name, params in self.model_runner.model.named_parameters():
+ params.data = self.cpu_model[name]
+
+
+def init_worker_distributed_environment(
+ parallel_config: ParallelConfig,
+ rank: int,
+ distributed_init_method: Optional[str] = "env://",
+ local_rank: int = -1,
+) -> None:
+ """Initialize the distributed environment."""
+ set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
+
+    # NOTE(sgm): using tcp://localhost:xxxx will hang in the HF setting without Megatron
+ init_distributed_environment(parallel_config.world_size, rank, distributed_init_method, local_rank)
+
+ ensure_model_parallel_initialized(
+ tensor_model_parallel_size=parallel_config.tensor_parallel_size,
+ pipeline_model_parallel_size=parallel_config.pipeline_parallel_size,
+ )
+
+ # TODO(sgm): check whether need this
+ # if pynccl_utils.is_initialized():
+ # pynccl_world_size = pynccl_utils.get_world_size()
+ # if pynccl_world_size != parallel_config.world_size:
+ # raise RuntimeError(
+ # "pynccl is already initialized but the pynccl world "
+ # "size does not match parallel_config.world_size "
+ # f"({pynccl_world_size} vs. {parallel_config.world_size}).")
+ # elif parallel_config.world_size > 1:
+ # # NOTE(woosuk): We don't initialize pynccl process group when world size
+ # # is 1.
+ # # NOTE(kaichao): By default, pynccl is initialized for tp group.
+ # pynccl_utils.init_process_group(
+ # group=get_tensor_model_parallel_cpu_group())
+
+ # # Initialize a custom fast all-reduce implementation.
+ # if not parallel_config.disable_custom_all_reduce:
+ # init_custom_ar()
+
+ # A small all_reduce for warmup.
+ torch.distributed.all_reduce(torch.zeros(1).cuda())
+ # if pynccl_utils.is_initialized():
+ # pynccl_utils.all_reduce(torch.zeros(1).cuda())
diff --git a/code/RL_model/verl/Search-R1/verl/trainer/__init__.py b/code/RL_model/verl/Search-R1/verl/trainer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/trainer/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/trainer/config/evaluation.yaml b/code/RL_model/verl/Search-R1/verl/trainer/config/evaluation.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0d8ccff888f65e831ec702291b904a4a8a6f8a22
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/trainer/config/evaluation.yaml
@@ -0,0 +1,6 @@
+data:
+ path: /tmp/math_Qwen2-7B-Instruct.parquet
+ prompt_key: prompt
+ response_key: responses
+ data_source_key: data_source
+ reward_model_key: reward_model
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/trainer/config/generation.yaml b/code/RL_model/verl/Search-R1/verl/trainer/config/generation.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ed805a8c04949ff02d0a7de67a2cf78788217ced
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/trainer/config/generation.yaml
@@ -0,0 +1,35 @@
+trainer:
+ nnodes: 1
+ n_gpus_per_node: 8
+
+data:
+ path: ~/data/rlhf/math/test.parquet
+ prompt_key: prompt
+ n_samples: 5
+ output_path: /opt/tiger/math_Qwen2-7B-Instruct.parquet
+ batch_size: 128
+
+model:
+ path: ~/models/Qwen2-7B-Instruct
+ external_lib: null
+rollout:
+ name: vllm
+ temperature: 1.0
+ top_k: 50 # 0 for hf rollout, -1 for vllm rollout
+ top_p: 0.7
+ prompt_length: 1536
+ response_length: 512
+ # for vllm rollout
+ dtype: bfloat16 # should align with FSDP
+ gpu_memory_utilization: 0.5
+ ignore_eos: False
+ micro_batch_size: 256
+ enforce_eager: True
+ free_cache_engine: True
+ load_format: dummy_dtensor
+ tensor_model_parallel_size: 1
+ max_num_batched_tokens: 8192
+ max_num_seqs: 1024
+ log_prob_micro_batch_size: 8
+ # for hf rollout
+ do_sample: True
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/trainer/config/ppo_megatron_trainer.yaml b/code/RL_model/verl/Search-R1/verl/trainer/config/ppo_megatron_trainer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6ae26851f38d32715789777b2af741c5da19cae2
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/trainer/config/ppo_megatron_trainer.yaml
@@ -0,0 +1,148 @@
+data:
+ tokenizer: null
+ train_files: ~/data/rlhf/gsm8k/train.parquet
+ val_files: ~/data/rlhf/gsm8k/test.parquet
+ prompt_key: prompt
+ max_prompt_length: 512
+ max_response_length: 512
+ train_batch_size: 1024
+ val_batch_size: 1312
+  return_raw_input_ids: False # This should be set to True when the policy and RM tokenizers differ
+ return_raw_chat: False
+
+actor_rollout_ref:
+ hybrid_engine: True
+ model:
+ path: ~/models/deepseek-llm-7b-chat
+ external_lib: null
+ override_config: {}
+ enable_gradient_checkpointing: False
+ actor:
+ strategy: megatron # This is for backward-compatibility
+ ppo_mini_batch_size: 256
+ ppo_micro_batch_size: 64
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ ppo_epochs: 1
+ shuffle: True
+ optim:
+ lr: 1e-6
+ clip_grad: 1.0
+ lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
+ min_lr_ratio: null # only useful for warmup with cosine
+ warmup_style: constant # select from constant/cosine
+      total_training_steps: -1 # must be overridden by the program
+ megatron:
+ tensor_model_parallel_size: 4
+ pipeline_model_parallel_size: 1
+      num_layers_per_virtual_pipeline_stage: null # vpp will hang; needs debugging
+ sequence_parallel: True
+ seed: 1
+ load_weight: True
+ ref:
+ megatron:
+ tensor_model_parallel_size: 4
+ pipeline_model_parallel_size: 1
+      num_layers_per_virtual_pipeline_stage: null # vpp will hang; needs debugging
+ sequence_parallel: True
+ seed: 1
+ load_weight: True
+ param_offload: False
+ log_prob_micro_batch_size: 32
+ rollout:
+ name: vllm
+ temperature: 1.0
+ top_k: -1 # 0 for hf rollout, -1 for vllm rollout
+ top_p: 1
+ prompt_length: ${data.max_prompt_length} # for xperf_gpt
+ response_length: ${data.max_response_length}
+ # for vllm rollout
+ dtype: bfloat16 # should align with FSDP
+ gpu_memory_utilization: 0.5
+ ignore_eos: False
+ enforce_eager: True
+ free_cache_engine: True
+ load_format: dummy_megatron
+ tensor_model_parallel_size: 2
+ max_num_batched_tokens: 8192
+ max_num_seqs: 1024
+ log_prob_micro_batch_size: 2
+ # for hf rollout
+ do_sample: True
+ layer_name_map:
+ qkv_layer_name: qkv
+ gate_proj_layer_name: gate_up
+ # number of responses (i.e. num sample times)
+ n: 1
+
+critic:
+ strategy: megatron
+ optim:
+ lr: 1e-5
+ clip_grad: 1.0
+ lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
+ min_lr_ratio: null # only useful for warmup with cosine
+ warmup_style: constant # select from constant/cosine
+    total_training_steps: -1 # must be overridden by the program
+ model:
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${actor_rollout_ref.model.path}
+ override_config: {}
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ enable_gradient_checkpointing: False
+ megatron:
+ tensor_model_parallel_size: 4
+ pipeline_model_parallel_size: 1
+    num_layers_per_virtual_pipeline_stage: null # vpp will hang; needs debugging
+ sequence_parallel: True
+ seed: 1
+ load_weight: True
+ ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+ ppo_micro_batch_size: 2
+ ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+ shuffle: ${actor_rollout_ref.actor.shuffle}
+ cliprange_value: 0.5
+ kl_ctrl:
+ type: fixed
+ kl_coef: 0.001
+
+reward_model:
+ enable: False
+ strategy: megatron
+ megatron:
+ tensor_model_parallel_size: 4
+ pipeline_model_parallel_size: 1
+    num_layers_per_virtual_pipeline_stage: null # vpp will hang; needs debugging
+ sequence_parallel: True
+ seed: 1
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ load_weight: True
+ param_offload: False
+ micro_batch_size: 64
+ max_length: null
+
+algorithm:
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: gae
+ kl_penalty: kl # how to estimate kl divergence
+ kl_ctrl:
+ type: fixed
+ kl_coef: 0.001
+
+trainer:
+ total_epochs: 30
+ total_training_steps: null
+ project_name: verl_examples
+ experiment_name: gsm8k
+ logger: ['console', 'wandb']
+ nnodes: 1
+ n_gpus_per_node: 8
+ save_freq: -1
+ test_freq: 2
+ critic_warmup: 0
+ default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name}
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
diff --git a/code/RL_model/verl/Search-R1/verl/trainer/config/ppo_trainer.yaml b/code/RL_model/verl/Search-R1/verl/trainer/config/ppo_trainer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..25452ac960964bf1170655701c9fd45a1a2fad5c
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/trainer/config/ppo_trainer.yaml
@@ -0,0 +1,180 @@
+data:
+ tokenizer: null
+ train_files: ~/data/rlhf/gsm8k/train.parquet
+ val_files: ~/data/rlhf/gsm8k/test.parquet
+ train_data_num: null
+ val_data_num: null
+ prompt_key: prompt
+ max_prompt_length: 512
+ max_response_length: 512
+ max_start_length: 256
+ max_obs_length: 512
+ train_batch_size: 1024
+ val_batch_size: 1312
+  return_raw_input_ids: False # This should be set to True when the policy and RM tokenizers differ
+ return_raw_chat: False
+ shuffle_train_dataloader: True
+
+actor_rollout_ref:
+ hybrid_engine: True
+ model:
+ path: ~/models/deepseek-llm-7b-chat
+ external_lib: null
+ override_config: { }
+ enable_gradient_checkpointing: False
+ use_remove_padding: False
+ actor:
+ strategy: fsdp # This is for backward-compatibility
+ ppo_mini_batch_size: 256
+ ppo_micro_batch_size: 64
+ use_dynamic_bsz: False
+ ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+ grad_clip: 1.0
+ state_masking: False
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: False # True for GRPO
+ kl_loss_coef: 0.001 # for grpo
+ kl_loss_type: low_var_kl # for grpo
+ ppo_epochs: 1
+ shuffle: False
+ ulysses_sequence_parallel_size: 1 # sp size
+ optim:
+ lr: 1e-6
+ lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
+ min_lr_ratio: null # only useful for warmup with cosine
+ warmup_style: constant # select from constant/cosine
+      total_training_steps: -1 # must be overridden by the program
+ fsdp_config:
+ wrap_policy:
+ # transformer_layer_cls_to_wrap: None
+ min_num_params: 0
+ param_offload: False
+ grad_offload: False
+ optimizer_offload: False
+ fsdp_size: -1
+ ref:
+ fsdp_config:
+ param_offload: False
+ wrap_policy:
+ # transformer_layer_cls_to_wrap: None
+ min_num_params: 0
+ fsdp_size: -1
+ log_prob_micro_batch_size: 128
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
+ rollout:
+ name: vllm
+ temperature: 1.0
+ top_k: -1 # 0 for hf rollout, -1 for vllm rollout
+ top_p: 0.95
+    prompt_length: ${data.max_prompt_length} # not used for open source
+ response_length: ${data.max_response_length}
+ # for vllm rollout
+ dtype: bfloat16 # should align with FSDP
+ gpu_memory_utilization: 0.5
+ ignore_eos: False
+ enforce_eager: True
+ free_cache_engine: True
+ load_format: dummy_dtensor
+ tensor_model_parallel_size: 2
+ max_num_batched_tokens: 8192
+ max_num_seqs: 1024
+ log_prob_micro_batch_size: 128
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ # for hf rollout
+ do_sample: True
+ # number of responses (i.e. num sample times)
+ n: 1 # > 1 for grpo
+    n_agent: 1 # differs here: used for agent tasks only
+
+critic:
+ strategy: fsdp
+ optim:
+ lr: 1e-5
+ lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
+ min_lr_ratio: null # only useful for warmup with cosine
+ warmup_style: constant # select from constant/cosine
+    total_training_steps: -1 # must be overridden by the program
+ model:
+ path: ~/models/deepseek-llm-7b-chat
+ tokenizer_path: ${actor_rollout_ref.model.path}
+ override_config: { }
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ enable_gradient_checkpointing: False
+ use_remove_padding: False
+ fsdp_config:
+ param_offload: False
+ grad_offload: False
+ optimizer_offload: False
+ wrap_policy:
+ # transformer_layer_cls_to_wrap: None
+ min_num_params: 0
+ fsdp_size: -1
+ ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+ ppo_micro_batch_size: 64
+ forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+ use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
+ forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: 1 # sp size
+ ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+ shuffle: ${actor_rollout_ref.actor.shuffle}
+ grad_clip: 1.0
+ cliprange_value: 0.5
+
+reward_model:
+ enable: False
+ strategy: fsdp
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ use_remove_padding: False
+ fsdp_config:
+ min_num_params: 0
+ param_offload: False
+ micro_batch_size: 64
+ max_length: null
+ ulysses_sequence_parallel_size: 1 # sp size
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+ structure_format_score: 0
+ final_format_score: 0
+ retrieval_score: 0
+
+retriever:
+ url: "http://127.0.0.1:8000/retrieve"
+ topk: 3
+
+algorithm:
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: gae
+ no_think_rl: False
+ kl_penalty: kl # how to estimate kl divergence
+ kl_ctrl:
+ type: fixed
+ kl_coef: 0.001
+ state_masking:
+ start_state_marker: ""
+ end_state_marker: ""
+
+trainer:
+ total_epochs: 30
+ total_training_steps: null
+ project_name: verl_examples
+ experiment_name: gsm8k
+ logger: [ 'console', 'wandb' ]
+ nnodes: 1
+ n_gpus_per_node: 8
+ save_freq: -1
+ test_freq: -1
+ critic_warmup: 0
+ default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name}
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
+
+max_turns: 10
+do_search: true
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/trainer/config/sft_trainer.yaml b/code/RL_model/verl/Search-R1/verl/trainer/config/sft_trainer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7f2e9d865957dee7d7223b059bf9dff7c547e9e5
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/trainer/config/sft_trainer.yaml
@@ -0,0 +1,42 @@
+data:
+ train_batch_size: 256
+ micro_batch_size: 16 # this is also val batch size
+ train_files: ~/data/gsm8k/train.parquet
+ val_files: ~/data/gsm8k/test.parquet
+ prompt_key: question
+ response_key: answer
+ max_length: 1024
+ truncation: error
+ balance_dp_token: False
+ chat_template: null
+model:
+ partial_pretrain: ~/models/gemma-1.1-7b-it
+ fsdp_config:
+ wrap_policy:
+ min_num_params: 0
+ cpu_offload: False
+ offload_params: False
+ external_lib: null
+ enable_gradient_checkpointing: False
+ trust_remote_code: False
+ lora_rank: 0 # Set to positive value to enable LoRA (e.g., 32)
+ lora_alpha: 16 # LoRA scaling factor
+ target_modules: [q_proj, v_proj] # Target modules for LoRA adaptation
+optim:
+ lr: 1e-5
+ betas: [0.9, 0.95]
+ weight_decay: 0.01
+ warmup_steps_ratio: 0.1
+ clip_grad: 1.0
+
+trainer:
+ default_local_dir: /tmp/sft_model
+ default_hdfs_dir: hdfs://tmp/experiments/gsm8k/gemma-1.1-7b-it/ # change the hdfs path here
+ resume_path: null
+ project_name: gsm8k-sft
+ experiment_name: test
+ total_epochs: 4
+ total_training_steps: null
+ validate_before_training: False
+ logger: ['console']
+ seed: 1
diff --git a/code/RL_model/verl/Search-R1/verl/trainer/fsdp_sft_trainer.py b/code/RL_model/verl/Search-R1/verl/trainer/fsdp_sft_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..77ccebf1ca661f11b64c7375f4ea4028f3a39fcc
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/trainer/fsdp_sft_trainer.py
@@ -0,0 +1,435 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+A lightweight one-file FSDP SFT Trainer
+TODO(zhangchi.usc1992)
+- Add calculation of mfu
+- Add validation
+"""
+
+import os
+
+os.environ['NCCL_DEBUG'] = 'WARN'
+os.environ['TOKENIZERS_PARALLELISM'] = 'true'
+
+import logging
+import re
+import torch
+import torch.distributed
+from torch import nn, optim
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, MixedPrecision, ShardingStrategy, CPUOffload
+from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedModel, AutoConfig
+from verl.utils.torch_functional import get_cosine_schedule_with_warmup
+from tensordict import TensorDict
+from torch.utils.data import DataLoader, DistributedSampler
+
+from verl.utils.fsdp_utils import get_fsdp_wrap_policy, init_fn, get_init_weight_context_manager
+from verl.utils.dataset import SFTDataset
+from verl.utils.fs import copy_local_path_from_hdfs
+from verl.utils.tracking import Tracking
+
+from torch.distributed.device_mesh import DeviceMesh
+
+import verl.utils.hdfs_io as hdfs_io
+from verl.utils.debug import log_gpu_memory_usage
+from peft import LoraConfig, TaskType, get_peft_model
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv('VERL_SFT_LOGGING_LEVEL', 'WARN'))
+
+
+def extract_step(path):
+ match = re.search(r'global_step_(\d+)', path)
+ if match:
+ return int(match.group(1))
+ return None
+
+
+def convert_to_regular_types(obj):
+ """Convert Hydra configs and other special types to regular Python types."""
+ from omegaconf import ListConfig, DictConfig
+ if isinstance(obj, (ListConfig, DictConfig)):
+ return {k: convert_to_regular_types(v) for k, v in obj.items()} if isinstance(obj, DictConfig) else list(obj)
+ elif isinstance(obj, (list, tuple)):
+ return [convert_to_regular_types(x) for x in obj]
+ elif isinstance(obj, dict):
+ return {k: convert_to_regular_types(v) for k, v in obj.items()}
+ return obj
+
+
+class FSDPSFTTrainer(object):
+
+ def __init__(self, config, device_mesh: DeviceMesh):
+ self.config = config
+ self.device_mesh = device_mesh
+ # build tokenizer first
+ local_model_path = copy_local_path_from_hdfs(src=self.config.model.partial_pretrain, verbose=True)
+ from verl.utils import hf_tokenizer
+ self.tokenizer = hf_tokenizer(local_model_path, trust_remote_code=self.config.model.trust_remote_code)
+ if self.config.data.chat_template is not None:
+ raise ValueError('Apply Chat template from config is not supported yet.')
+
+ # normalize dp size
+ self._normalize_config_bsz()
+
+ self._build_dataloader()
+ # build model
+ self._build_model_optimizer()
+
+ # TODO: add checkpoint manager
+ if self.device_mesh.get_rank() == 0:
+ print(self.config)
+
+ def _normalize_config_bsz(self):
+ dp_size = self.device_mesh.size()
+ if self.device_mesh.get_rank() == 0:
+ print(f'Normalize batch size by dp {dp_size}')
+
+ assert self.config.data.train_batch_size % dp_size == 0
+ assert self.config.data.micro_batch_size % dp_size == 0
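+        # e.g. (illustrative): train_batch_size=256 with dp_size=8 becomes a
+        # per-rank batch of 32.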
+
+ self.config.data.train_batch_size //= dp_size
+ self.config.data.micro_batch_size //= dp_size
+
+ def _build_dataloader(self):
+ config = self.config
+ # build dataset
+ self.train_dataset = SFTDataset(parquet_files=config.data.train_files,
+ tokenizer=self.tokenizer,
+ prompt_key=config.data.prompt_key,
+ prompt_dict_keys=config.data.get('prompt_dict_keys', None),
+ response_key=config.data.response_key,
+ response_dict_keys=config.data.get('response_dict_keys', None),
+ max_length=config.data.max_length,
+ truncation=config.data.truncation)
+ self.val_dataset = SFTDataset(parquet_files=config.data.val_files,
+ tokenizer=self.tokenizer,
+ prompt_key=config.data.prompt_key,
+ prompt_dict_keys=config.data.get('prompt_dict_keys', None),
+ response_key=config.data.response_key,
+ response_dict_keys=config.data.get('response_dict_keys', None),
+ max_length=config.data.max_length,
+ truncation=config.data.truncation)
+
+ # build dataloader
+ rank = self.device_mesh.get_rank()
+ world_size = self.device_mesh.size()
+ self.train_sampler = DistributedSampler(self.train_dataset,
+ shuffle=True,
+ num_replicas=world_size,
+ rank=rank,
+ drop_last=True)
+ self.train_dataloader = DataLoader(dataset=self.train_dataset,
+ batch_size=config.data.train_batch_size,
+ sampler=self.train_sampler,
+ num_workers=8,
+ pin_memory=True,
+ drop_last=True)
+
+ self.val_sampler = DistributedSampler(self.val_dataset,
+ shuffle=True,
+ num_replicas=world_size,
+ rank=rank,
+ drop_last=True)
+ self.val_dataloader = DataLoader(dataset=self.val_dataset,
+ batch_size=config.data.micro_batch_size,
+ sampler=self.val_sampler,
+ num_workers=8,
+ pin_memory=True,
+ drop_last=True)
+
+ def _build_model_optimizer(self):
+ # TODO (zhangchi.usc1992):
+ # 1. support pretrain from random weights
+ # 2. support init directly from sharded weights
+ local_model_path = copy_local_path_from_hdfs(src=self.config.model.partial_pretrain, verbose=True)
+
+ if self.config.model.get('external_lib', None) is not None:
+ # This is used to import external_lib into the huggingface systems
+ import importlib
+ importlib.import_module(self.config.model.external_lib)
+
+ log_gpu_memory_usage('Before model allocation', logger=logger)
+
+ trust_remote_code = self.config.model.trust_remote_code
+ # load config first
+ config = AutoConfig.from_pretrained(local_model_path, trust_remote_code=trust_remote_code)
+
+        # The model may be very large; initialize on meta device when possible to avoid materializing full weights (disabled when word embeddings are tied)
+ init_context = get_init_weight_context_manager(use_meta_tensor=not config.tie_word_embeddings)
+
+ with init_context():
+ self.model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(local_model_path,
+ config=config,
+ torch_dtype=torch.float32,
+ attn_implementation='flash_attention_2',
+ trust_remote_code=trust_remote_code)
+ if self.config.model.get('lora_rank', 0) > 0:
+ self.model.enable_input_require_grads()
+ # Convert config to regular Python types before creating PEFT model
+ lora_config = {
+ 'task_type': TaskType.CAUSAL_LM,
+ 'r': self.config.model.lora_rank,
+ 'lora_alpha': self.config.model.lora_alpha,
+ 'target_modules': convert_to_regular_types(self.config.model.target_modules),
+ 'bias': "none"
+ }
+ self.model = get_peft_model(self.model, LoraConfig(**lora_config))
+
+ if self.config.model.enable_gradient_checkpointing:
+ self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant': False})
+
+ log_gpu_memory_usage('After model allocation', logger=logger)
+
+ mixed_precision = MixedPrecision(param_dtype=torch.bfloat16,
+ reduce_dtype=torch.float32,
+ buffer_dtype=torch.float32)
+
+ auto_wrap_policy = get_fsdp_wrap_policy(self.model,
+ config=self.config.model.fsdp_config.wrap_policy,
+ is_lora=self.config.model.get('lora_rank', 0) > 0)
+ if self.device_mesh.get_rank() == 0:
+ print(auto_wrap_policy)
+
+ if not self.config.model.fsdp_config.cpu_offload:
+ cpu_offload = None
+ else:
+ cpu_offload = CPUOffload(offload_params=self.config.model.fsdp_config.offload_params)
+
+ self.fsdp_model = FSDP(module=self.model,
+ auto_wrap_policy=auto_wrap_policy,
+ param_init_fn=init_fn,
+ sharding_strategy=ShardingStrategy.FULL_SHARD,
+ mixed_precision=mixed_precision,
+ device_mesh=self.device_mesh,
+ sync_module_states=True,
+ device_id=torch.cuda.current_device(),
+ cpu_offload=cpu_offload,
+ use_orig_params=False)
+
+ log_gpu_memory_usage('After FSDP wrapping', logger=logger)
+
+ self.optimizer = optim.AdamW(self.fsdp_model.parameters(),
+ lr=self.config.optim.lr,
+ betas=self.config.optim.betas,
+ weight_decay=self.config.optim.weight_decay)
+
+ log_gpu_memory_usage('After initialize optimizer', logger=logger)
+
+ steps_per_epoch = len(self.train_dataloader)
+ total_steps = steps_per_epoch * self.config.trainer.total_epochs
+
+ if self.device_mesh.get_rank() == 0:
+ print(
+ f'Number of steps/epoch {steps_per_epoch}, number of epochs {self.config.trainer.total_epochs}, total number of steps {total_steps}'
+ )
+
+ num_warmup_steps = int(total_steps * self.config.optim.warmup_steps_ratio)
+
+ self.lr_scheduler = get_cosine_schedule_with_warmup(optimizer=self.optimizer,
+ num_warmup_steps=num_warmup_steps,
+ num_training_steps=total_steps)
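+        # For illustration: with total_steps=1000 and warmup_steps_ratio=0.05, the LR
+        # ramps up linearly for the first 50 steps and then follows a cosine decay
+        # towards zero at step 1000.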
+
+ def _compute_loss(self, batch):
+ loss_mask = batch.pop('loss_mask')[:, :-1].reshape(-1).cuda()
+ labels = batch['input_ids'][:, 1:].cuda()
+
+ with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
+ output = self.fsdp_model(input_ids=batch['input_ids'],
+ attention_mask=batch['attention_mask'],
+ position_ids=batch['position_ids'],
+                                    use_cache=False)  # prevent the model from behaving as if it were generating
+
+ logits = output.logits
+
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels.contiguous()
+ # Flatten the tokens
+ loss_fct = nn.CrossEntropyLoss(reduction='none')
+ shift_logits = shift_logits.view(-1, self.model.config.vocab_size)
+ shift_labels = shift_labels.view(-1)
+ # Enable model parallelism
+ shift_labels = shift_labels.to(shift_logits.device)
+ loss = loss_fct(shift_logits, shift_labels)
+ loss = loss * loss_mask
+
+ valid_token_this_rank = torch.sum(loss_mask)
+
+ if self.config.data.balance_dp_token:
+ torch.distributed.all_reduce(valid_token_this_rank) # becomes total valid tokens in all ranks
+ dp_size = torch.distributed.get_world_size()
+ else:
+ dp_size = 1
+
+        loss = torch.sum(loss) / valid_token_this_rank * dp_size  # global token-mean; the dp_size factor offsets gradient averaging across dp ranks
+ return loss
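+    # Worked example for the dp-token balancing above: suppose two ranks have local
+    # loss sums 6.0 and 2.0 over 3 and 1 valid tokens. The all-reduce makes
+    # valid_token_this_rank = 4 on both ranks, each computes sum(loss) / 4 * 2, and the
+    # subsequent gradient averaging across ranks recovers the global token mean
+    # (6.0 + 2.0) / 4 = 2.0.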
+
+ def training_step(self, batch: TensorDict):
+ self.fsdp_model.train()
+
+ log_gpu_memory_usage('Before optimizer zero_grad', logger=logger)
+
+ self.optimizer.zero_grad()
+
+ log_gpu_memory_usage('After optimizer zero_grad', logger=logger)
+
+ micro_batches = batch.split(self.config.data.micro_batch_size)
+ n_micro_batches = len(micro_batches)
+ step_loss = 0
+ for micro_batch in micro_batches:
+ loss = self._compute_loss(batch=micro_batch) / n_micro_batches
+ loss.backward()
+ step_loss += loss.item()
+
+ self.fsdp_model.clip_grad_norm_(max_norm=self.config.optim.clip_grad)
+
+ log_gpu_memory_usage('Before optimizer step', logger=logger)
+
+ self.optimizer.step()
+
+ log_gpu_memory_usage('After optimizer step', logger=logger)
+
+ self.lr_scheduler.step()
+
+        lr = self.lr_scheduler.get_last_lr()[0]
+
+        log_gpu_memory_usage('After lr scheduler step', logger=logger)
+
+        # reduce loss across dp ranks
+        step_loss = torch.tensor(step_loss).cuda()
+ torch.distributed.all_reduce(step_loss, op=torch.distributed.ReduceOp.AVG)
+ return {'train/loss': step_loss.detach().item(), 'train/lr(1e-3)': lr * 1e3}
+
+ def validation_step(self, batch: TensorDict):
+ self.fsdp_model.eval()
+ with torch.no_grad():
+ loss = self._compute_loss(batch)
+ torch.distributed.all_reduce(loss, op=torch.distributed.ReduceOp.AVG)
+ return loss
+
+ def save_checkpoint(self, step):
+ # save checkpoint
+ from torch.distributed.fsdp import FullStateDictConfig, StateDictType
+ cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
+ with FSDP.state_dict_type(self.fsdp_model, StateDictType.FULL_STATE_DICT, cfg):
+ state_dict = self.fsdp_model.state_dict()
+
+ path = os.path.join(self.config.trainer.default_local_dir, f'global_step_{step}')
+ # save huggingface model
+ if self.device_mesh.get_rank() == 0:
+ os.makedirs(path, exist_ok=True)
+ self.model.save_pretrained(path, state_dict=state_dict)
+ self.tokenizer.save_pretrained(path)
+ if self.config.trainer.default_hdfs_dir:
+ hdfs_io.makedirs(self.config.trainer.default_hdfs_dir, exist_ok=True)
+ hdfs_io.copy(src=path, dst=self.config.trainer.default_hdfs_dir, dirs_exist_ok=True)
+ torch.distributed.barrier()
+
+ def fit(self):
+ rank = self.device_mesh.get_rank()
+
+ # TODO: add a unified tracking
+ if rank == 0:
+ tracking = Tracking(project_name=self.config.trainer.project_name,
+ experiment_name=self.config.trainer.experiment_name,
+ default_backend=self.config.trainer.logger)
+
+ global_step = 0
+ # compute the total training steps.
+        # in SFT, the total number of training steps is mainly used for early exit
+ total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
+
+ if self.config.trainer.total_training_steps is not None:
+ total_training_steps = self.config.trainer.total_training_steps
+
+ self.total_training_steps = total_training_steps
+ print(f'Total training steps: {self.total_training_steps}')
+
+ # TODO (zhangchi.usc1992) add back checkpoint manager. Currently, it blocks when uploading to hdfs. So very slow.
+
+ if self.config.trainer.validate_before_training:
+ # validate before training
+ val_losses = []
+ for data in self.val_dataloader:
+ data = TensorDict(data, batch_size=self.config.data.micro_batch_size).cuda()
+ val_loss = self.validation_step(data)
+ val_losses.append(val_loss)
+ if rank == 0:
+ val_loss = torch.mean(torch.stack(val_losses))
+ metric = {'val/loss': val_loss.detach().item()}
+ tracking.log(data=metric, step=global_step)
+ torch.distributed.barrier()
+
+ for epoch in range(self.config.trainer.total_epochs):
+ self.train_sampler.set_epoch(epoch=epoch)
+ for data in self.train_dataloader:
+ data = TensorDict(data, batch_size=self.config.data.train_batch_size).cuda()
+ metric = self.training_step(data)
+ if rank == 0:
+ tracking.log(data=metric, step=global_step)
+ global_step += 1
+
+ # for early exit validation
+ if global_step >= self.total_training_steps:
+ # Perform final validation
+ val_losses = []
+ for val_data in self.val_dataloader:
+ val_data = TensorDict(val_data, batch_size=self.config.data.micro_batch_size).cuda()
+ val_loss = self.validation_step(val_data)
+ val_losses.append(val_loss)
+ if rank == 0:
+ avg_val_loss = torch.mean(torch.stack(val_losses))
+ metric = {'val/loss': avg_val_loss.detach().item()}
+ tracking.log(data=metric, step=global_step)
+ torch.distributed.barrier()
+
+ # Save final checkpoint
+ self.save_checkpoint(step=global_step)
+ return
+
+ # validation
+ val_losses = []
+ for data in self.val_dataloader:
+ data = TensorDict(data, batch_size=self.config.data.micro_batch_size).cuda()
+ val_loss = self.validation_step(data)
+ val_losses.append(val_loss)
+ if rank == 0:
+ val_loss = torch.mean(torch.stack(val_losses))
+ metric = {'val/loss': val_loss.detach().item()}
+ tracking.log(data=metric, step=global_step)
+ torch.distributed.barrier()
+
+ # save checkpoint
+ self.save_checkpoint(step=global_step)
+
+
+from verl.trainer.fsdp_sft_trainer import FSDPSFTTrainer
+import hydra
+
+from torch.distributed.device_mesh import init_device_mesh
+
+from verl.utils.distributed import initialize_global_process_group
+
+
+@hydra.main(config_path='config', config_name='sft_trainer', version_base=None)
+def main(config):
+ local_rank, rank, world_size = initialize_global_process_group()
+
+ device_mesh = init_device_mesh(device_type='cuda', mesh_shape=(world_size,), mesh_dim_names=('dp',))
+ trainer = FSDPSFTTrainer(config=config, device_mesh=device_mesh)
+ trainer.fit()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/code/RL_model/verl/Search-R1/verl/trainer/main_eval.py b/code/RL_model/verl/Search-R1/verl/trainer/main_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..018bdd8fdbe01dddda5da009694246021320ab44
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/trainer/main_eval.py
@@ -0,0 +1,69 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Offline evaluation of generated sequences using a reward model and a ground-truth verifier.
+The input is a parquet file that contains N generated sequences per prompt and, optionally, the ground truth.
+
+"""
+
+import hydra
+from verl.utils.fs import copy_local_path_from_hdfs
+from verl.utils.reward_score import math, gsm8k
+import pandas as pd
+import numpy as np
+
+
+def select_reward_fn(data_source):
+ if data_source == 'lighteval/MATH':
+ return math.compute_score
+ else:
+ raise NotImplementedError
+
+
+@hydra.main(config_path='config', config_name='evaluation', version_base=None)
+def main(config):
+ local_path = copy_local_path_from_hdfs(config.data.path)
+ dataset = pd.read_parquet(local_path)
+ prompts = dataset[config.data.prompt_key]
+ responses = dataset[config.data.response_key]
+ data_sources = dataset[config.data.data_source_key]
+ reward_model_data = dataset[config.data.reward_model_key]
+
+ passes = 0
+
+ total = len(dataset)
+
+ for i in range(total):
+ response_lst = responses[i]
+ data_source = data_sources[i]
+ # select reward score based on data_source
+ prompt = prompts[i]
+ reward_data = reward_model_data[i]
+ reward_fn = select_reward_fn(data_source)
+ ground_truth = reward_data['ground_truth']
+ score_lst = []
+ for r in response_lst:
+ score = reward_fn(r, ground_truth)
+ score_lst.append(score)
+
+ max_score = np.max(score_lst)
+
+ if max_score == 1:
+ passes += 1
+
+    # pass@k, where k is the number of generated samples per prompt
+    print(f'pass@{len(responses[0])}: {passes / total}')
+
+
+if __name__ == '__main__':
+ main()
diff --git a/code/RL_model/verl/Search-R1/verl/trainer/main_generation.py b/code/RL_model/verl/Search-R1/verl/trainer/main_generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c3bd923fc30b20b07ff831b75657a1e949b6e43
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/trainer/main_generation.py
@@ -0,0 +1,137 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Generate responses given a dataset of prompts
+"""
+import ray
+import numpy as np
+import hydra
+import os
+
+os.environ['NCCL_DEBUG'] = 'WARN'
+os.environ['TOKENIZERS_PARALLELISM'] = 'true'
+# os.environ['TORCH_COMPILE_DISABLE'] = '1'
+
+from verl.utils.model import compute_position_id_with_mask
+
+import pandas as pd
+
+from transformers import AutoTokenizer
+
+from verl import DataProto
+from verl.utils.fs import copy_local_path_from_hdfs
+from verl.workers.fsdp_workers import ActorRolloutRefWorker
+from verl.utils.hdfs_io import makedirs
+from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
+
+
+@hydra.main(config_path='config', config_name='generation', version_base=None)
+def main(config):
+ from pprint import pprint
+ from omegaconf import OmegaConf
+ pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values
+ OmegaConf.resolve(config)
+ local_path = copy_local_path_from_hdfs(config.model.path)
+ from verl.utils import hf_tokenizer
+ tokenizer = hf_tokenizer(local_path)
+
+ if config.rollout.temperature == 0.:
+ assert config.data.n_samples == 1, 'When temperature=0, n_samples must be 1.'
+
+    # read dataset. Note that the dataset should directly contain chat template format (e.g., a list of dictionaries)
+ dataset = pd.read_parquet(config.data.path)
+ chat_lst = dataset[config.data.prompt_key].tolist()
+
+ chat_lst = [chat.tolist() for chat in chat_lst]
+
+ tokenizer.padding_side = 'left'
+ if tokenizer.pad_token is None:
+ tokenizer.pad_token = tokenizer.eos_token
+
+ ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(ActorRolloutRefWorker), config=config, role='rollout')
+ resource_pool = RayResourcePool(process_on_nodes=[config.trainer.n_gpus_per_node] * config.trainer.nnodes)
+ wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init)
+ wg.init_model()
+
+ total_samples = len(dataset)
+ config_batch_size = config.data.batch_size
+ dp_size = wg.world_size // config.rollout.tensor_model_parallel_size
+    num_batch = (total_samples + config_batch_size - 1) // config_batch_size  # ceiling division; avoids an empty trailing batch
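+    # e.g. 1000 samples with batch_size=128 give ceil(1000 / 128) = 8 batches, the last
+    # one holding the remaining 104 samples.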
+ output_lst = [[] for _ in range(config.data.n_samples)]
+
+ for batch_idx in range(num_batch):
+ print(f'[{batch_idx+1}/{num_batch}] Start to process.')
+ batch_chat_lst = chat_lst[batch_idx * config_batch_size:(batch_idx + 1) * config_batch_size]
+ inputs = tokenizer.apply_chat_template(batch_chat_lst,
+ add_generation_prompt=True,
+ padding=True,
+ truncation=True,
+ max_length=config.rollout.prompt_length,
+ return_tensors='pt',
+ return_dict=True,
+ tokenize=True)
+ input_ids = inputs['input_ids']
+ attention_mask = inputs['attention_mask']
+ position_ids = compute_position_id_with_mask(attention_mask)
+
+ batch_dict = {'input_ids': input_ids, 'attention_mask': attention_mask, 'position_ids': position_ids}
+
+ data = DataProto.from_dict(batch_dict)
+ real_batch_size = data.batch['input_ids'].shape[0]
+ if real_batch_size % dp_size != 0:
+ dummy_data_size = dp_size - real_batch_size % dp_size
+ dummy_data = data[:dummy_data_size]
+ data = DataProto.concat([data, dummy_data])
+            print(
+                f'real_batch_size {real_batch_size} is not divisible by dp_size {dp_size}, adding {dummy_data_size} dummy samples'
+            )
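+        # e.g. real_batch_size=30 with dp_size=8 pads 2 dummy rows (repeating the first
+        # two samples) so that 32 rows split evenly across the data-parallel workers;
+        # the padded rows are sliced off again right after generation below.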
+
+ batch_size = data.batch['input_ids'].shape[0]
+ assert batch_size % dp_size == 0, f'batch_size {batch_size} is not divisible by dp_size {dp_size}'
+
+ print(f'[{batch_idx+1}/{num_batch}] Start to generate.')
+ # START TO GENERATE FOR n_samples TIMES
+ for i in range(config.data.n_samples):
+ output = wg.generate_sequences(data)
+ # remove dummy data
+ output = output[:real_batch_size]
+ output_text = tokenizer.batch_decode(output.batch['input_ids'][:, -config.rollout.response_length:],
+ skip_special_tokens=False)
+
+ # remove the padding
+ pad_token = tokenizer.pad_token
+ output_text_unpad = []
+ for text in output_text:
+ output_text_unpad.append(text.replace(pad_token, ''))
+
+ output_lst[i].extend(output_text_unpad)
+
+    # convert output_lst from (n_samples, n_data) to (n_data, n_samples)
+ output_lst = np.array(output_lst, dtype=object)
+ output_lst = np.transpose(output_lst, axes=(1, 0)).tolist()
+
+ # add to the data frame
+    dataset['responses'] = output_lst
+
+ # write to a new parquet
+ output_dir = os.path.dirname(config.data.output_path)
+ makedirs(output_dir, exist_ok=True)
+ dataset.to_parquet(config.data.output_path)
+
+    return output_lst  # all decoded responses, shaped (n_data, n_samples)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/code/RL_model/verl/Search-R1/verl/trainer/main_ppo.py b/code/RL_model/verl/Search-R1/verl/trainer/main_ppo.py
new file mode 100644
index 0000000000000000000000000000000000000000..583c71b13428970b99575c467f3aeee4b8f97e50
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/trainer/main_ppo.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Note that we do not combine the main entry with ray_trainer, because ray_trainer is also used by other entry points.
+"""
+
+from verl import DataProto
+import torch
+from verl.utils.reward_score import qa_em
+from verl.trainer.ppo.ray_trainer import RayPPOTrainer
+import re
+import numpy as np
+
+def _select_rm_score_fn(data_source):
+ if data_source in ['nq', 'triviaqa', 'popqa', 'hotpotqa', '2wikimultihopqa', 'musique', 'bamboogle', 'multiclinsum']:
+ return qa_em.compute_score
+ else:
+ raise NotImplementedError
+
+
+class RewardManager():
+ """The reward manager.
+ """
+
+ def __init__(self, tokenizer, num_examine, format_score=0.) -> None:
+ self.tokenizer = tokenizer
+ self.num_examine = num_examine # the number of batches of decoded responses to print to the console
+ self.format_score = format_score
+
+ def __call__(self, data: DataProto):
+ """We will expand this function gradually based on the available datasets"""
+
+        # If rm scores already exist, return them directly. Otherwise, compute them via the rule-based scoring function
+ if 'rm_scores' in data.batch.keys():
+ return data.batch['rm_scores']
+
+ reward_tensor = torch.zeros_like(data.batch['responses'], dtype=torch.float32)
+
+ already_print_data_sources = {}
+
+ for i in range(len(data)):
+ data_item = data[i] # DataProtoItem
+
+ prompt_ids = data_item.batch['prompts']
+
+ prompt_length = prompt_ids.shape[-1]
+
+ valid_prompt_length = data_item.batch['attention_mask'][:prompt_length].sum()
+ valid_prompt_ids = prompt_ids[-valid_prompt_length:]
+
+ response_ids = data_item.batch['responses']
+ valid_response_length = data_item.batch['attention_mask'][prompt_length:].sum()
+ valid_response_ids = response_ids[:valid_response_length]
+
+ # decode
+ sequences = torch.cat((valid_prompt_ids, valid_response_ids))
+ sequences_str = self.tokenizer.decode(sequences)
+
+ ground_truth = data_item.non_tensor_batch['reward_model']['ground_truth']
+
+ # select rm_score
+ data_source = data_item.non_tensor_batch['data_source']
+ compute_score_fn = _select_rm_score_fn(data_source)
+
+ score = compute_score_fn(solution_str=sequences_str, ground_truth=ground_truth, format_score=self.format_score)
+
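+            # place the scalar outcome reward on the last valid response token only;
+            # every other position stays zero, which is what the outcome-supervision
+            # advantage estimators in core_algos expect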
+ reward_tensor[i, valid_response_length - 1] = score
+
+ if data_source not in already_print_data_sources:
+ already_print_data_sources[data_source] = 0
+
+ if already_print_data_sources[data_source] < self.num_examine:
+ already_print_data_sources[data_source] += 1
+ print(sequences_str)
+
+ # print(f"[DEBUG] all_scores: {all_scores}")
+ # print(f"[DEBUG] all_scores shape: {np.array(all_scores).shape}")
+ # print(f"[DEBUG] all_scores mean: {np.mean(all_scores)}")
+ # print(f"[DEBUG] all_scores max: {np.max(all_scores)}")
+ # print(f"[DEBUG] all_scores min: {np.min(all_scores)}")
+ # print(f"[DEBUG] all_scores std: {np.std(all_scores)}")
+
+ return reward_tensor
+
+
+import ray
+import hydra
+
+
+@hydra.main(config_path='config', config_name='ppo_trainer', version_base=None)
+def main(config):
+ if not ray.is_initialized():
+ # this is for local ray cluster
+ ray.init(runtime_env={'env_vars': {'TOKENIZERS_PARALLELISM': 'true', 'NCCL_DEBUG': 'WARN'}})
+
+ ray.get(main_task.remote(config))
+
+
+@ray.remote
+def main_task(config):
+ from verl.utils.fs import copy_local_path_from_hdfs
+ from transformers import AutoTokenizer
+
+ # print initial config
+ from pprint import pprint
+ from omegaconf import OmegaConf
+ pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values
+ OmegaConf.resolve(config)
+
+ # env_class = ENV_CLASS_MAPPING[config.env.name]
+
+ # download the checkpoint from hdfs
+ local_path = copy_local_path_from_hdfs(config.actor_rollout_ref.model.path)
+
+ # instantiate tokenizer
+ from verl.utils import hf_tokenizer
+ tokenizer = hf_tokenizer(local_path)
+
+ # define worker classes
+ if config.actor_rollout_ref.actor.strategy == 'fsdp':
+ assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
+ from verl.workers.fsdp_workers import ActorRolloutRefWorker, CriticWorker
+ from verl.single_controller.ray import RayWorkerGroup
+ ray_worker_group_cls = RayWorkerGroup
+
+ elif config.actor_rollout_ref.actor.strategy == 'megatron':
+ assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
+ from verl.workers.megatron_workers import ActorRolloutRefWorker, CriticWorker
+ from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
+ ray_worker_group_cls = NVMegatronRayWorkerGroup
+
+ else:
+ raise NotImplementedError
+
+ from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
+
+ role_worker_mapping = {
+ Role.ActorRollout: ray.remote(ActorRolloutRefWorker),
+ Role.Critic: ray.remote(CriticWorker),
+ Role.RefPolicy: ray.remote(ActorRolloutRefWorker),
+ }
+
+ global_pool_id = 'global_pool'
+ resource_pool_spec = {
+ global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
+ }
+ mapping = {
+ Role.ActorRollout: global_pool_id,
+ Role.Critic: global_pool_id,
+ Role.RefPolicy: global_pool_id,
+ }
+
+ # we should adopt a multi-source reward function here
+ # - for rule-based rm, we directly call a reward score
+ # - for model-based rm, we call a model
+ # - for code related prompt, we send to a sandbox if there are test cases
+ # - finally, we combine all the rewards together
+ # - The reward type depends on the tag of the data
+ if config.reward_model.enable:
+ if config.reward_model.strategy == 'fsdp':
+ from verl.workers.fsdp_workers import RewardModelWorker
+ elif config.reward_model.strategy == 'megatron':
+ from verl.workers.megatron_workers import RewardModelWorker
+ else:
+ raise NotImplementedError
+ role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
+ mapping[Role.RewardModel] = global_pool_id
+
+ reward_fn = RewardManager(tokenizer=tokenizer, num_examine=0)
+
+ # Note that we always use function-based RM for validation
+ val_reward_fn = RewardManager(tokenizer=tokenizer, num_examine=1)
+
+ resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
+ trainer = RayPPOTrainer(config=config,
+ tokenizer=tokenizer,
+ role_worker_mapping=role_worker_mapping,
+ resource_pool_manager=resource_pool_manager,
+ ray_worker_group_cls=ray_worker_group_cls,
+ reward_fn=reward_fn,
+ val_reward_fn=val_reward_fn,
+ )
+ trainer.init_workers()
+ trainer.fit()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/code/RL_model/verl/Search-R1/verl/trainer/main_ppo_format.py b/code/RL_model/verl/Search-R1/verl/trainer/main_ppo_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..6620b8e032d9fb5781d8bfbbc0bddfd651c41937
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/trainer/main_ppo_format.py
@@ -0,0 +1,205 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Note that we do not combine the main entry with ray_trainer, because ray_trainer is also used by other entry points.
+"""
+
+from verl import DataProto
+import torch
+from verl.utils.reward_score import qa_em, qa_em_format
+from verl.trainer.ppo.ray_trainer import RayPPOTrainer
+import re
+import numpy as np
+
+def _select_rm_score_fn(data_source):
+ if data_source in ['nq', 'triviaqa', 'popqa', 'web_questions', 'hotpotqa', '2wikimultihopqa', 'musique', 'bamboogle', 'strategyqa']:
+ return qa_em_format.compute_score_em
+ else:
+ raise NotImplementedError
+
+
+class RewardManager():
+ """The reward manager.
+ """
+
+ def __init__(self, tokenizer, num_examine, structure_format_score=0., final_format_score=0., retrieval_score=0., format_score=0.) -> None:
+ self.tokenizer = tokenizer
+ self.num_examine = num_examine # the number of batches of decoded responses to print to the console
+ self.format_score = format_score
+ self.structure_format_score = structure_format_score
+ self.final_format_score = final_format_score
+ self.retrieval_score = retrieval_score
+
+ def __call__(self, data: DataProto):
+ """We will expand this function gradually based on the available datasets"""
+
+        # If rm scores already exist, return them directly. Otherwise, compute them via the rule-based scoring function
+ if 'rm_scores' in data.batch.keys():
+ return data.batch['rm_scores']
+
+ reward_tensor = torch.zeros_like(data.batch['responses'], dtype=torch.float32)
+
+ already_print_data_sources = {}
+
+ for i in range(len(data)):
+ data_item = data[i] # DataProtoItem
+
+ prompt_ids = data_item.batch['prompts']
+
+ prompt_length = prompt_ids.shape[-1]
+
+ valid_prompt_length = data_item.batch['attention_mask'][:prompt_length].sum()
+ valid_prompt_ids = prompt_ids[-valid_prompt_length:]
+
+ response_ids = data_item.batch['responses']
+ valid_response_length = data_item.batch['attention_mask'][prompt_length:].sum()
+ valid_response_ids = response_ids[:valid_response_length]
+
+ # decode
+ sequences = torch.cat((valid_prompt_ids, valid_response_ids))
+ sequences_str = self.tokenizer.decode(sequences)
+
+ ground_truth = data_item.non_tensor_batch['reward_model']['ground_truth']
+
+ # select rm_score
+ data_source = data_item.non_tensor_batch['data_source']
+ compute_score_fn = _select_rm_score_fn(data_source)
+
+ score = compute_score_fn(solution_str=sequences_str, ground_truth=ground_truth,
+ structure_format_score=self.structure_format_score,
+ final_format_score=self.final_format_score,
+ retrieval_score=self.retrieval_score,
+ format_score=self.format_score)
+
+ reward_tensor[i, valid_response_length - 1] = score
+
+ if data_source not in already_print_data_sources:
+ already_print_data_sources[data_source] = 0
+
+ if already_print_data_sources[data_source] < self.num_examine:
+ already_print_data_sources[data_source] += 1
+ print(sequences_str)
+
+ return reward_tensor
+
+
+import ray
+import hydra
+
+
+@hydra.main(config_path='config', config_name='ppo_trainer', version_base=None)
+def main(config):
+ if not ray.is_initialized():
+ # this is for local ray cluster
+ ray.init(runtime_env={'env_vars': {'TOKENIZERS_PARALLELISM': 'true', 'NCCL_DEBUG': 'WARN'}})
+
+ ray.get(main_task.remote(config))
+
+
+@ray.remote
+def main_task(config):
+ from verl.utils.fs import copy_local_path_from_hdfs
+ from transformers import AutoTokenizer
+
+ # print initial config
+ from pprint import pprint
+ from omegaconf import OmegaConf
+ pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values
+ OmegaConf.resolve(config)
+
+ # env_class = ENV_CLASS_MAPPING[config.env.name]
+
+ # download the checkpoint from hdfs
+ local_path = copy_local_path_from_hdfs(config.actor_rollout_ref.model.path)
+
+ # instantiate tokenizer
+ from verl.utils import hf_tokenizer
+ tokenizer = hf_tokenizer(local_path)
+
+ # define worker classes
+ if config.actor_rollout_ref.actor.strategy == 'fsdp':
+ assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
+ from verl.workers.fsdp_workers import ActorRolloutRefWorker, CriticWorker
+ from verl.single_controller.ray import RayWorkerGroup
+ ray_worker_group_cls = RayWorkerGroup
+
+ elif config.actor_rollout_ref.actor.strategy == 'megatron':
+ assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
+ from verl.workers.megatron_workers import ActorRolloutRefWorker, CriticWorker
+ from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
+ ray_worker_group_cls = NVMegatronRayWorkerGroup
+
+ else:
+ raise NotImplementedError
+
+ from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
+
+ role_worker_mapping = {
+ Role.ActorRollout: ray.remote(ActorRolloutRefWorker),
+ Role.Critic: ray.remote(CriticWorker),
+ Role.RefPolicy: ray.remote(ActorRolloutRefWorker),
+ }
+
+ global_pool_id = 'global_pool'
+ resource_pool_spec = {
+ global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
+ }
+ mapping = {
+ Role.ActorRollout: global_pool_id,
+ Role.Critic: global_pool_id,
+ Role.RefPolicy: global_pool_id,
+ }
+
+ # we should adopt a multi-source reward function here
+ # - for rule-based rm, we directly call a reward score
+ # - for model-based rm, we call a model
+ # - for code related prompt, we send to a sandbox if there are test cases
+ # - finally, we combine all the rewards together
+ # - The reward type depends on the tag of the data
+ if config.reward_model.enable:
+ if config.reward_model.strategy == 'fsdp':
+ from verl.workers.fsdp_workers import RewardModelWorker
+ elif config.reward_model.strategy == 'megatron':
+ from verl.workers.megatron_workers import RewardModelWorker
+ else:
+ raise NotImplementedError
+ role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)
+ mapping[Role.RewardModel] = global_pool_id
+
+ reward_fn = RewardManager(tokenizer=tokenizer, num_examine=0,
+ structure_format_score=config.reward_model.structure_format_score,
+ final_format_score=config.reward_model.final_format_score,
+ retrieval_score=config.reward_model.retrieval_score)
+
+ # Note that we always use function-based RM for validation
+ val_reward_fn = RewardManager(tokenizer=tokenizer, num_examine=1)
+
+ resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
+ trainer = RayPPOTrainer(config=config,
+ tokenizer=tokenizer,
+ role_worker_mapping=role_worker_mapping,
+ resource_pool_manager=resource_pool_manager,
+ ray_worker_group_cls=ray_worker_group_cls,
+ reward_fn=reward_fn,
+ val_reward_fn=val_reward_fn,
+ )
+ trainer.init_workers()
+ trainer.fit()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/code/RL_model/verl/Search-R1/verl/trainer/ppo/__init__.py b/code/RL_model/verl/Search-R1/verl/trainer/ppo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/trainer/ppo/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/trainer/ppo/core_algos.py b/code/RL_model/verl/Search-R1/verl/trainer/ppo/core_algos.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3f4aff3034d5b4c202d04582e2f04eed6e7cfec
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/trainer/ppo/core_algos.py
@@ -0,0 +1,274 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Core functions to implement PPO algorithms.
+The functions implemented in this file are meant to be used by trainers with different
+distributed strategies to implement PPO.
+"""
+
+import numpy as np
+import torch
+from collections import defaultdict
+
+import verl.utils.torch_functional as verl_F
+
+
+class AdaptiveKLController:
+ """
+ Adaptive KL controller described in the paper:
+ https://arxiv.org/pdf/1909.08593.pdf
+ """
+
+ def __init__(self, init_kl_coef, target_kl, horizon):
+ self.value = init_kl_coef
+ self.target = target_kl
+ self.horizon = horizon
+
+ def update(self, current_kl, n_steps):
+ target = self.target
+ proportional_error = np.clip(current_kl / target - 1, -0.2, 0.2)
+ mult = 1 + proportional_error * n_steps / self.horizon
+ self.value *= mult
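+        # Illustration: with target=0.1, horizon=10000, a measured KL of 0.2 over
+        # n_steps=100 gives proportional_error = clip(1.0, -0.2, 0.2) = 0.2 and
+        # multiplies the coefficient by 1 + 0.2 * 100 / 10000 = 1.002.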
+
+
+class FixedKLController:
+ """Fixed KL controller."""
+
+ def __init__(self, kl_coef):
+ self.value = kl_coef
+
+ def update(self, current_kl, n_steps):
+ pass
+
+
+def get_kl_controller(config): # seems never used?
+ if config.critic.kl_ctrl.type == 'fixed':
+ kl_ctrl = FixedKLController(kl_coef=config.critic.kl_ctrl.kl_coef)
+ elif config.critic.kl_ctrl.type == 'adaptive':
+        assert config.critic.kl_ctrl.horizon > 0, f'horizon must be larger than 0. Got {config.critic.kl_ctrl.horizon}'
+ kl_ctrl = AdaptiveKLController(init_kl_coef=config.critic.kl_ctrl.kl_coef,
+ target_kl=config.critic.kl_ctrl.target_kl,
+ horizon=config.critic.kl_ctrl.horizon)
+ else:
+ raise ValueError('Unknown kl_ctrl type')
+
+ return kl_ctrl
+
+
+def compute_gae_advantage_return(token_level_rewards: torch.Tensor, values: torch.Tensor, eos_mask: torch.Tensor,
+ gamma: torch.Tensor, lam: torch.Tensor):
+ """Adapted from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py
+
+ Args:
+ token_level_rewards: `(torch.Tensor)`
+ shape: (bs, response_length)
+ values: `(torch.Tensor)`
+ shape: (bs, response_length)
+ eos_mask: `(torch.Tensor)`
+            shape: (bs, response_length). [EOS] mask; tokens after [EOS] have mask zero.
+        gamma: `(float)`
+            discount factor used in RL
+        lam: `(float)`
+            lambda value used when computing Generalized Advantage Estimation (https://arxiv.org/abs/1506.02438)
+
+    Returns:
+        advantages: `(torch.Tensor)`
+            shape: (bs, response_length)
+        returns: `(torch.Tensor)`
+ shape: (bs, response_length)
+
+ """
+ with torch.no_grad():
+ lastgaelam = 0
+ advantages_reversed = []
+ gen_len = token_level_rewards.shape[-1]
+
+ for t in reversed(range(gen_len)):
+ nextvalues = values[:, t + 1] if t < gen_len - 1 else 0.0
+ delta = token_level_rewards[:, t] + gamma * nextvalues - values[:, t]
+ lastgaelam = delta + gamma * lam * lastgaelam
+ advantages_reversed.append(lastgaelam)
+ advantages = torch.stack(advantages_reversed[::-1], dim=1)
+
+ returns = advantages + values
+ advantages = verl_F.masked_whiten(advantages, eos_mask)
+ return advantages, returns
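+# Sanity check for the recursion above: with gen_len=1, delta = r_0 - v_0, so the
+# advantage is r_0 - v_0 and the return is advantage + v_0 = r_0 (before the final
+# masked whitening rescales advantages batch-wise).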
+
+
+# NOTE(sgm): this implementation only consider outcome supervision, where the reward is a scalar.
+def compute_grpo_outcome_advantage(token_level_rewards: torch.Tensor,
+ eos_mask: torch.Tensor,
+ index: torch.Tensor,
+ epsilon: float = 1e-6):
+ """
+ Compute advantage for GRPO, operating only on Outcome reward
+ (with only one scalar reward for each response).
+ Args:
+ token_level_rewards: `(torch.Tensor)`
+ shape: (bs, response_length)
+ eos_mask: `(torch.Tensor)`
+ shape: (bs, response_length)
+
+ Returns:
+ advantages: `(torch.Tensor)`
+ shape: (bs, response_length)
+        returns: `(torch.Tensor)`
+ shape: (bs, response_length)
+ """
+ response_length = token_level_rewards.shape[-1]
+ non_zero_mask = (token_level_rewards != 0)
+ scores = (token_level_rewards * non_zero_mask).sum(dim=-1)
+
+ id2score = defaultdict(list)
+ id2mean = {}
+ id2std = {}
+
+ with torch.no_grad():
+ bsz = scores.shape[0]
+ for i in range(bsz):
+ id2score[index[i]].append(scores[i])
+ for idx in id2score:
+ if len(id2score[idx]) == 1:
+ id2mean[idx] = torch.tensor(0.0)
+ id2std[idx] = torch.tensor(1.0)
+ elif len(id2score[idx]) > 1:
+ id2mean[idx] = torch.mean(torch.tensor(id2score[idx]))
+                id2std[idx] = torch.std(torch.tensor(id2score[idx]))
+ else:
+ raise ValueError(f"no score in prompt index: {idx}")
+ for i in range(bsz):
+ scores[i] = (scores[i] - id2mean[index[i]]) / (id2std[index[i]] + epsilon)
+ scores = scores.unsqueeze(-1).tile([1, response_length]) * eos_mask
+
+ return scores, scores
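+# Illustration: three responses to one prompt (same `index`) scored [1, 0, 1] get the
+# group mean 2/3 subtracted and are divided by the group std (plus epsilon), then each
+# per-response scalar is broadcast over its response tokens via eos_mask; singleton
+# groups fall back to mean 0 and std 1.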
+
+
+def compute_rewards(token_level_scores, old_log_prob, ref_log_prob, kl_ratio):
+ kl = old_log_prob - ref_log_prob
+ return token_level_scores - kl * kl_ratio
+
+
+def compute_policy_loss(old_log_prob, log_prob, advantages, eos_mask, cliprange):
+ """Adapted from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1122
+
+ Args:
+ old_log_prob: `(torch.Tensor)`
+ shape: (bs, response_length)
+ log_prob: `(torch.Tensor)`
+ shape: (bs, response_length)
+ advantages: `(torch.Tensor)`
+ shape: (bs, response_length)
+ eos_mask: `(torch.Tensor)`
+ shape: (bs, response_length)
+ cliprange: (float)
+ The clip range used in PPO. See https://arxiv.org/abs/1707.06347
+
+ Returns:
+ pg_loss: `a scalar torch.Tensor`
+ policy gradient loss computed via PPO
+ pg_clipfrac: (float)
+ a float number indicating the fraction of policy gradient loss being clipped
+
+ """
+ negative_approx_kl = log_prob - old_log_prob
+ ratio = torch.exp(negative_approx_kl)
+ ppo_kl = verl_F.masked_mean(-negative_approx_kl, eos_mask)
+
+ pg_losses = -advantages * ratio
+ pg_losses2 = -advantages * torch.clamp(ratio, 1.0 - cliprange, 1.0 + cliprange)
+
+ pg_loss = verl_F.masked_mean(torch.max(pg_losses, pg_losses2), eos_mask)
+ pg_clipfrac = verl_F.masked_mean(torch.gt(pg_losses2, pg_losses).float(), eos_mask)
+ return pg_loss, pg_clipfrac, ppo_kl
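+# Illustration of the clipping: with cliprange=0.2, advantage=1.0 and ratio=1.5, the
+# unclipped term is -1.5 while the clipped term is -1.2; torch.max keeps -1.2, so the
+# sample contributes the clipped objective and is counted in pg_clipfrac.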
+
+
+def compute_entropy_loss(logits, eos_mask):
+ """Compute Categorical entropy loss
+
+ Args:
+ logits: `(torch.Tensor)`
+ shape: (bs, response_length, vocab_size)
+ eos_mask: `(torch.Tensor)`
+ shape: (bs, response_length)
+
+ Returns:
+ entropy: a scalar torch.Tensor
+
+ """
+ # compute entropy
+ entropy = verl_F.entropy_from_logits(logits) # (bs, response_len)
+ entropy_loss = verl_F.masked_mean(entropy, mask=eos_mask)
+ return entropy_loss
+
+
+def compute_value_loss(vpreds, returns, values, eos_mask, cliprange_value):
+ """Compute the value loss. Copied from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1151
+
+ Args:
+ vpreds (`torch.FloatTensor`):
+ Predicted values of the value head, shape (`batch_size`, `response_length`)
+ values (`torch.FloatTensor`):
+ Old values of value head, shape (`batch_size`, `response_length`)
+ returns: (`torch.FloatTensor`):
+ Ground truth returns, shape (`batch_size`, `response_length`)
+
+ Returns:
+ vf_loss: a scalar (`torch.FloatTensor`):
+ value function loss
+ vf_clipfrac: a float
+ The ratio of vf being clipped
+
+ """
+ vpredclipped = verl_F.clip_by_value(vpreds, values - cliprange_value, values + cliprange_value)
+ vf_losses1 = (vpreds - returns)**2
+ vf_losses2 = (vpredclipped - returns)**2
+ vf_loss = 0.5 * verl_F.masked_mean(torch.max(vf_losses1, vf_losses2), eos_mask)
+ vf_clipfrac = verl_F.masked_mean(torch.gt(vf_losses2, vf_losses1).float(), eos_mask)
+ return vf_loss, vf_clipfrac
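+# Illustration: with cliprange_value=0.3, an old value of 1.0 and a new prediction of
+# 2.0, the prediction is clipped to 1.3 and the larger of the two squared errors drives
+# the loss, keeping value updates close to the previous estimate.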
+
+
+def kl_penalty(logprob: torch.FloatTensor, ref_logprob: torch.FloatTensor, kl_penalty) -> torch.FloatTensor:
+ """Compute KL divergence given logprob and ref_logprob.
+ Copied from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1104
+
+ Args:
+        logprob: log-probabilities of the sampled tokens under the current policy, shape (bs, response_length)
+        ref_logprob: log-probabilities of the same tokens under the reference policy, shape (bs, response_length)
+
+    Returns:
+        per-token KL penalty, shape (bs, response_length)
+ """
+ if kl_penalty == "kl":
+ return logprob - ref_logprob
+
+ if kl_penalty == "abs":
+ return (logprob - ref_logprob).abs()
+
+ if kl_penalty == "mse":
+ return 0.5 * (logprob - ref_logprob).square()
+
+    # J. Schulman. Approximating KL divergence, 2020.
+    # URL http://joschu.net/blog/kl-approx.html.
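+    # This is the k3 estimator exp(x) - x - 1 with x = ref_logprob - logprob: it is
+    # non-negative (since exp(x) >= 1 + x) and has lower variance than the plain
+    # difference of log-probs.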
+ if kl_penalty == 'low_var_kl':
+ kl = ref_logprob - logprob
+ ratio = torch.exp(kl)
+ kld = (ratio - kl - 1).contiguous()
+ return torch.clamp(kld, min=-10, max=10)
+
+ if kl_penalty == "full":
+ # so, here logprob and ref_logprob should contain the logits for every token in vocabulary
+ raise NotImplementedError
+
+ raise NotImplementedError
diff --git a/code/RL_model/verl/Search-R1/verl/trainer/ppo/ray_trainer.py b/code/RL_model/verl/Search-R1/verl/trainer/ppo/ray_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..4304e0584813c36857265f17238eab38d4b816c3
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/trainer/ppo/ray_trainer.py
@@ -0,0 +1,867 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+FSDP PPO Trainer with Ray-based single controller.
+This trainer supports model-agnostic model initialization with HuggingFace.
+"""
+
+import os
+import uuid
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from enum import Enum
+from pprint import pprint
+from typing import Type, Dict
+
+import re
+import json
+from collections import defaultdict
+
+import numpy as np
+from codetiming import Timer
+from omegaconf import OmegaConf, open_dict
+from verl import DataProto
+from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto
+from verl.single_controller.base import Worker
+from verl.single_controller.ray import RayResourcePool, RayWorkerGroup, RayClassWithInitArgs
+from verl.single_controller.ray.base import create_colocated_worker_cls
+from verl.trainer.ppo import core_algos
+from verl.utils.seqlen_balancing import get_seqlen_balanced_partitions, log_seqlen_unbalance
+
+import re
+from search_r1.llm_agent.generation import LLMGenerationManager, GenerationConfig
+
+WorkerType = Type[Worker]
+
+
+class Role(Enum):
+ """
+ To create more roles dynamically, you can subclass Role and add new members
+ """
+ Actor = 0
+ Rollout = 1
+ ActorRollout = 2
+ Critic = 3
+ RefPolicy = 4
+ RewardModel = 5
+ ActorRolloutRef = 6
+
+
+@dataclass
+class ResourcePoolManager:
+ """
+    Defines the resource pool specification and the mapping from roles to resource pools.
+    Resource pools will be initialized first.
+ """
+ resource_pool_spec: dict[str, list[int]]
+ mapping: dict[Role, str]
+ resource_pool_dict: dict[str, RayResourcePool] = field(default_factory=dict)
+
+ def create_resource_pool(self):
+ for resource_pool_name, process_on_nodes in self.resource_pool_spec.items():
+ # max_colocate_count means the number of WorkerGroups (i.e. processes) in each RayResourcePool
+            # For FSDP backend, we recommend max_colocate_count=1, which merges all WorkerGroups into one.
+            # For Megatron backend, we recommend max_colocate_count>1 so that different WorkerGroups can serve different models
+ resource_pool = RayResourcePool(process_on_nodes=process_on_nodes,
+ use_gpu=True,
+ max_colocate_count=1,
+ name_prefix=resource_pool_name)
+ self.resource_pool_dict[resource_pool_name] = resource_pool
+
+ def get_resource_pool(self, role: Role) -> RayResourcePool:
+ """Get the resource pool of the worker_cls"""
+ return self.resource_pool_dict[self.mapping[role]]
+
+
+import torch
+from verl.utils.torch_functional import masked_mean
+
+
+def apply_kl_penalty(data: DataProto, kl_ctrl: core_algos.AdaptiveKLController, kl_penalty='kl'):
+ responses = data.batch['responses']
+ response_length = responses.size(1)
+ token_level_scores = data.batch['token_level_scores']
+ batch_size = data.batch.batch_size[0]
+ attention_mask = data.batch['info_mask'] if 'info_mask' in data.batch else data.batch['attention_mask']
+ response_mask = attention_mask[:, -response_length:]
+
+ # compute kl between ref_policy and current policy
+ if 'ref_log_prob' in data.batch.keys():
+ kld = core_algos.kl_penalty(data.batch['old_log_probs'], data.batch['ref_log_prob'],
+ kl_penalty=kl_penalty) # (batch_size, response_length)
+ kld = kld * response_mask
+ beta = kl_ctrl.value
+ else:
+ beta = 0
+ kld = torch.zeros_like(response_mask, dtype=torch.float32)
+
+ token_level_rewards = token_level_scores - beta * kld
+
+ current_kl = masked_mean(kld, mask=response_mask, axis=-1) # average over sequence
+ current_kl = torch.mean(current_kl, dim=0).item()
+
+ # according to https://github.com/huggingface/trl/blob/951ca1841f29114b969b57b26c7d3e80a39f75a0/trl/trainer/ppo_trainer.py#L837
+ kl_ctrl.update(current_kl=current_kl, n_steps=batch_size)
+ data.batch['token_level_rewards'] = token_level_rewards
+
+ metrics = {'critic/kl': current_kl, 'critic/kl_coeff': beta}
+
+ return data, metrics
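+# Net effect: each response token's reward becomes its score minus beta * KL at that
+# token, e.g. beta=0.001 with a per-token KL of 2.0 subtracts 0.002 from the token's
+# score.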
+
+
+def compute_advantage(data: DataProto, adv_estimator, gamma=1.0, lam=1.0, num_repeat=1):
+ # prepare response group
+ # TODO: add other ways to estimate advantages
+ if adv_estimator == 'gae':
+ values = data.batch['values']
+ responses = data.batch['responses']
+ response_length = responses.size(-1)
+ attention_mask = data.batch['attention_mask']
+ response_mask = attention_mask[:, -response_length:]
+ token_level_rewards = data.batch['token_level_rewards']
+ advantages, returns = core_algos.compute_gae_advantage_return(token_level_rewards=token_level_rewards,
+ values=values,
+ eos_mask=response_mask,
+ gamma=gamma,
+ lam=lam)
+ data.batch['advantages'] = advantages
+ data.batch['returns'] = returns
+ elif adv_estimator == 'grpo':
+ token_level_rewards = data.batch['token_level_rewards']
+ index = data.non_tensor_batch['uid']
+ responses = data.batch['responses']
+ response_length = responses.size(-1)
+ attention_mask = data.batch['attention_mask']
+ response_mask = attention_mask[:, -response_length:]
+ advantages, returns = core_algos.compute_grpo_outcome_advantage(token_level_rewards=token_level_rewards,
+ eos_mask=response_mask,
+ index=index)
+ data.batch['advantages'] = advantages
+ data.batch['returns'] = returns
+ else:
+ raise NotImplementedError
+ return data
+
+
+def reduce_metrics(metrics: dict):
+ for key, val in metrics.items():
+ metrics[key] = np.mean(val)
+ return metrics
+
+
+def _compute_response_info(batch):
+ response_length = batch.batch['responses'].shape[-1]
+
+ prompt_mask = batch.batch['attention_mask'][:, :-response_length]
+ response_mask = batch.batch['attention_mask'][:, -response_length:]
+
+ prompt_length = prompt_mask.sum(-1).float()
+ response_length = response_mask.sum(-1).float() # (batch_size,)
+
+ return dict(
+ response_mask=response_mask,
+ prompt_length=prompt_length,
+ response_length=response_length,
+ )
+
+
+def compute_data_metrics(batch, use_critic=True):
+ # TODO: add response length
+ sequence_score = batch.batch['token_level_scores'].sum(-1)
+ sequence_reward = batch.batch['token_level_rewards'].sum(-1)
+
+ advantages = batch.batch['advantages']
+ returns = batch.batch['returns']
+
+ max_response_length = batch.batch['responses'].shape[-1]
+
+ prompt_mask = batch.batch['attention_mask'][:, :-max_response_length].bool()
+ response_mask = batch.batch['attention_mask'][:, -max_response_length:].bool()
+
+ max_prompt_length = prompt_mask.size(-1)
+
+ response_info = _compute_response_info(batch)
+ prompt_length = response_info['prompt_length']
+ response_length = response_info['response_length']
+
+ valid_adv = torch.masked_select(advantages, response_mask)
+ valid_returns = torch.masked_select(returns, response_mask)
+
+ if use_critic:
+ values = batch.batch['values']
+ valid_values = torch.masked_select(values, response_mask)
+ return_diff_var = torch.var(valid_returns - valid_values)
+ return_var = torch.var(valid_returns)
+
+ metrics = {
+ # score
+ 'critic/score/mean':
+ torch.mean(sequence_score).detach().item(),
+ 'critic/score/max':
+ torch.max(sequence_score).detach().item(),
+ 'critic/score/min':
+ torch.min(sequence_score).detach().item(),
+ # reward
+ 'critic/rewards/mean':
+ torch.mean(sequence_reward).detach().item(),
+ 'critic/rewards/max':
+ torch.max(sequence_reward).detach().item(),
+ 'critic/rewards/min':
+ torch.min(sequence_reward).detach().item(),
+ # adv
+ 'critic/advantages/mean':
+ torch.mean(valid_adv).detach().item(),
+ 'critic/advantages/max':
+ torch.max(valid_adv).detach().item(),
+ 'critic/advantages/min':
+ torch.min(valid_adv).detach().item(),
+ # returns
+ 'critic/returns/mean':
+ torch.mean(valid_returns).detach().item(),
+ 'critic/returns/max':
+ torch.max(valid_returns).detach().item(),
+ 'critic/returns/min':
+ torch.min(valid_returns).detach().item(),
+ **({
+ # values
+ 'critic/values/mean': torch.mean(valid_values).detach().item(),
+ 'critic/values/max': torch.max(valid_values).detach().item(),
+ 'critic/values/min': torch.min(valid_values).detach().item(),
+ # vf explained var
+ 'critic/vf_explained_var': (1.0 - return_diff_var / (return_var + 1e-5)).detach().item(),
+ } if use_critic else {}),
+
+ # response length
+ 'response_length/mean':
+ torch.mean(response_length).detach().item(),
+ 'response_length/max':
+ torch.max(response_length).detach().item(),
+ 'response_length/min':
+ torch.min(response_length).detach().item(),
+ 'response_length/clip_ratio':
+ torch.mean(torch.eq(response_length, max_response_length).float()).detach().item(),
+ # prompt length
+ 'prompt_length/mean':
+ torch.mean(prompt_length).detach().item(),
+ 'prompt_length/max':
+ torch.max(prompt_length).detach().item(),
+ 'prompt_length/min':
+ torch.min(prompt_length).detach().item(),
+ 'prompt_length/clip_ratio':
+ torch.mean(torch.eq(prompt_length, max_prompt_length).float()).detach().item(),
+ }
+
+ # metrics for actions
+ if 'turns_stats' in batch.meta_info:
+ metrics['env/number_of_actions/mean'] = float(np.array(batch.meta_info['turns_stats'], dtype=np.int16).mean())
+ metrics['env/number_of_actions/max'] = float(np.array(batch.meta_info['turns_stats'], dtype=np.int16).max())
+ metrics['env/number_of_actions/min'] = float(np.array(batch.meta_info['turns_stats'], dtype=np.int16).min())
+ if 'active_mask' in batch.meta_info:
+ metrics['env/finish_ratio'] = 1 - float(np.array(batch.meta_info['active_mask'], dtype=np.int16).mean())
+ if 'valid_action_stats' in batch.meta_info:
+ metrics['env/number_of_valid_action'] = float(np.array(batch.meta_info['valid_action_stats'], dtype=np.int16).mean())
+ metrics['env/ratio_of_valid_action'] = float((np.array(batch.meta_info['valid_action_stats'], dtype=np.int16) / np.array(batch.meta_info['turns_stats'], dtype=np.int16)).mean())
+ if 'valid_search_stats' in batch.meta_info:
+ metrics['env/number_of_valid_search'] = float(np.array(batch.meta_info['valid_search_stats'], dtype=np.int16).mean())
+
+
+ return metrics
+
+
+def compute_timing_metrics(batch, timing_raw):
+ response_info = _compute_response_info(batch)
+ num_prompt_tokens = torch.sum(response_info['prompt_length']).item()
+ num_response_tokens = torch.sum(response_info['response_length']).item()
+ num_overall_tokens = num_prompt_tokens + num_response_tokens
+
+ num_tokens_of_section = {
+ 'gen': num_response_tokens,
+ **{
+ name: num_overall_tokens for name in ['ref', 'values', 'adv', 'update_critic', 'update_actor', 'rollout']
+ },
+ }
+
+ return {
+ **{
+ f'timing_s/{name}': value for name, value in timing_raw.items()
+ },
+ **{
+            f'timing_per_token_ms/{name}': timing_raw[name] * 1000 / num_tokens_of_section[name]
+            for name in set(num_tokens_of_section.keys()) & set(timing_raw.keys())
+ },
+ }
+
+
+@contextmanager
+def _timer(name: str, timing_raw: Dict[str, float]):
+ with Timer(name=name, logger=None) as timer:
+ yield
+ timing_raw[name] = timer.last
+
+
+class RayPPOTrainer(object):
+ """
+ Note that this trainer runs on the driver process on a single CPU/GPU node.
+ """
+
+ # TODO: support each role have individual ray_worker_group_cls,
+ # i.e., support different backend of different role
+ def __init__(self,
+ config,
+ tokenizer,
+ role_worker_mapping: dict[Role, WorkerType],
+ resource_pool_manager: ResourcePoolManager,
+ ray_worker_group_cls: RayWorkerGroup = RayWorkerGroup,
+ reward_fn=None,
+ val_reward_fn=None):
+
+ # assert torch.cuda.is_available(), 'cuda must be available on driver'
+
+ self.tokenizer = tokenizer
+ self.config = config
+ self.reward_fn = reward_fn
+ self.val_reward_fn = val_reward_fn
+
+ self.hybrid_engine = config.actor_rollout_ref.hybrid_engine
+        assert self.hybrid_engine, 'Currently, only the hybrid engine is supported'
+
+ if self.hybrid_engine:
+ assert Role.ActorRollout in role_worker_mapping, f'{role_worker_mapping.keys()=}'
+
+ self.role_worker_mapping = role_worker_mapping
+ self.resource_pool_manager = resource_pool_manager
+ self.use_reference_policy = Role.RefPolicy in role_worker_mapping
+ self.use_rm = Role.RewardModel in role_worker_mapping
+ self.ray_worker_group_cls = ray_worker_group_cls
+
+ # define KL control
+ if self.use_reference_policy:
+ if config.algorithm.kl_ctrl.type == 'fixed':
+ self.kl_ctrl = core_algos.FixedKLController(kl_coef=config.algorithm.kl_ctrl.kl_coef)
+ elif config.algorithm.kl_ctrl.type == 'adaptive':
+ assert config.algorithm.kl_ctrl.horizon > 0, f'horizon must be larger than 0. Got {config.critic.kl_ctrl.horizon}'
+ self.kl_ctrl = core_algos.AdaptiveKLController(init_kl_coef=config.algorithm.kl_ctrl.kl_coef,
+ target_kl=config.algorithm.kl_ctrl.target_kl,
+ horizon=config.algorithm.kl_ctrl.horizon)
+ else:
+ raise NotImplementedError
+ else:
+ self.kl_ctrl = core_algos.FixedKLController(kl_coef=0.)
+
+ self._create_dataloader()
+ self._init_logger()
+
+ def _init_logger(self):
+ from verl.utils.tracking import Tracking
+ self.logger = Tracking(project_name=self.config.trainer.project_name,
+ experiment_name=self.config.trainer.experiment_name,
+ default_backend=self.config.trainer.logger,
+ config=OmegaConf.to_container(self.config, resolve=True))
+
+ def _create_dataloader(self):
+ from torch.utils.data import DataLoader
+ # TODO: we have to make sure the batch size is divisible by the dp size
+ from verl.utils.dataset.rl_dataset import RLHFDataset, collate_fn
+ self.train_dataset = RLHFDataset(parquet_files=self.config.data.train_files,
+ tokenizer=self.tokenizer,
+ prompt_key=self.config.data.prompt_key,
+ max_prompt_length=self.config.data.max_prompt_length,
+ filter_prompts=True,
+ return_raw_chat=self.config.data.get('return_raw_chat', False),
+ truncation='error')
+ if self.config.data.train_data_num is not None:
+ if self.config.data.train_data_num > len(self.train_dataset.dataframe):
+ print(f"[WARNING] training dataset size is smaller than desired size. Using the dataset as the original size {len(self.train_dataset.dataframe)}")
+ else:
+ self.train_dataset.dataframe = self.train_dataset.dataframe.sample(self.config.data.train_data_num, random_state=42)
+ print(f"filtered training dataset size: {len(self.train_dataset.dataframe)}")
+
+ self.train_dataloader = DataLoader(dataset=self.train_dataset,
+ batch_size=self.config.data.train_batch_size,
+ shuffle=self.config.data.shuffle_train_dataloader,
+ drop_last=True,
+ collate_fn=collate_fn)
+
+ self.val_dataset = RLHFDataset(parquet_files=self.config.data.val_files,
+ tokenizer=self.tokenizer,
+ prompt_key=self.config.data.prompt_key,
+ max_prompt_length=self.config.data.max_prompt_length,
+ filter_prompts=True,
+ return_raw_chat=self.config.data.get('return_raw_chat', False),
+ truncation='error')
+ if self.config.data.val_data_num is not None:
+ if self.config.data.val_data_num > len(self.val_dataset.dataframe):
+ print(f"[WARNING] validation dataset size is smaller than desired size. Using the dataset as the original size {len(self.val_dataset.dataframe)}")
+ else:
+ self.val_dataset.dataframe = self.val_dataset.dataframe.sample(self.config.data.val_data_num, random_state=42)
+ print(f"filtered validation dataset size: {len(self.val_dataset.dataframe)}")
+
+ self.val_dataloader = DataLoader(dataset=self.val_dataset,
+ batch_size=self.config.data.val_batch_size,
+ shuffle=False,
+ drop_last=True,
+ collate_fn=collate_fn)
+
+ print(f'Size of train dataloader: {len(self.train_dataloader)}')
+ print(f'Size of val dataloader: {len(self.val_dataloader)}')
+
+ assert len(self.train_dataloader) >= 1
+ assert len(self.val_dataloader) >= 1
+
+ # inject total_training_steps to actor/critic optim_config. This is hacky.
+ total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
+
+ if self.config.trainer.total_training_steps is not None:
+ total_training_steps = self.config.trainer.total_training_steps
+
+ self.total_training_steps = total_training_steps
+ print(f'Total training steps: {self.total_training_steps}')
+
+ OmegaConf.set_struct(self.config, True)
+ with open_dict(self.config):
+ self.config.actor_rollout_ref.actor.optim.total_training_steps = total_training_steps
+ self.config.critic.optim.total_training_steps = total_training_steps
+
+ def _validate(self):
+ """
+ The training loop of PPO with global metric computation.
+ Accumulates metrics across all batches before computing final statistics.
+ """
+ import torch
+ reward_tensor_lst = []
+ data_source_lst = []
+
+ gen_config = GenerationConfig(
+ max_turns=self.config.max_turns,
+ max_start_length=self.config.data.max_start_length,
+ max_prompt_length=self.config.data.max_prompt_length,
+ max_response_length=self.config.data.max_response_length,
+ max_obs_length=self.config.data.max_obs_length,
+ num_gpus=self.config.trainer.n_gpus_per_node * self.config.trainer.nnodes,
+ no_think_rl=self.config.algorithm.no_think_rl,
+            search_url=self.config.retriever.url,
+            topk=self.config.retriever.topk,
+ )
+
+ # Agent config preparation
+ generation_manager = LLMGenerationManager(
+ tokenizer=self.tokenizer,
+ actor_rollout_wg=self.actor_rollout_wg,
+ config=gen_config,
+            is_validation=True,
+ )
+
+ if not self.config.do_search:
+ for test_data in self.val_dataloader:
+ test_batch = DataProto.from_single_dict(test_data)
+
+ # we only do validation on rule-based rm
+ if self.config.reward_model.enable and test_batch[0].non_tensor_batch['reward_model']['style'] == 'model':
+ return {}
+
+ test_gen_batch = test_batch.pop(['input_ids', 'attention_mask', 'position_ids'])
+ test_gen_batch.meta_info = {
+ 'eos_token_id': self.tokenizer.eos_token_id,
+ 'pad_token_id': self.tokenizer.pad_token_id,
+ 'recompute_log_prob': False,
+ 'do_sample': False,
+ 'validate': True,
+ }
+
+ # pad to be divisible by dp_size
+ test_gen_batch_padded, pad_size = pad_dataproto_to_divisor(test_gen_batch, self.actor_rollout_wg.world_size)
+ test_output_gen_batch_padded = self.actor_rollout_wg.generate_sequences(test_gen_batch_padded)
+ # unpad
+ test_output_gen_batch = unpad_dataproto(test_output_gen_batch_padded, pad_size=pad_size)
+ print('validation generation end')
+
+ test_batch = test_batch.union(test_output_gen_batch)
+
+ # evaluate using reward_function
+ # for certain reward function (e.g. sandbox), the generation can overlap with reward
+ reward_tensor = self.val_reward_fn(test_batch)
+
+ reward_tensor_lst.append(reward_tensor)
+ data_source_lst.append(test_batch.non_tensor_batch.get('data_source', ['unknown'] * reward_tensor.shape[0]))
+ else:
+ for batch_dict in self.val_dataloader:
+ timing_raw = {}
+ test_batch: DataProto = DataProto.from_single_dict(batch_dict)
+ # test_batch = test_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n_agent, interleave=True)
+
+ test_gen_batch = test_batch.pop(batch_keys=['input_ids', 'attention_mask', 'position_ids'])
+ test_gen_batch.meta_info = {
+ 'eos_token_id': self.tokenizer.eos_token_id,
+ 'pad_token_id': self.tokenizer.pad_token_id,
+ 'recompute_log_prob': False,
+ 'do_sample': False,
+ 'validate': True,
+ }
+ with _timer('step', timing_raw):
+ first_input_ids = test_gen_batch.batch['input_ids'][:, -gen_config.max_start_length:].clone()
+ with _timer('gen', timing_raw):
+ generation_manager.timing_raw = timing_raw
+ final_gen_batch_output = generation_manager.run_llm_loop(
+ gen_batch=test_gen_batch,
+ initial_input_ids=first_input_ids,
+ )
+
+ test_batch = test_batch.union(final_gen_batch_output)
+
+ for key in test_batch.batch.keys():
+ test_batch.batch[key] = test_batch.batch[key].long()
+
+ # evaluate using reward_function
+ # for certain reward function (e.g. sandbox), the generation can overlap with reward
+ reward_tensor = self.val_reward_fn(test_batch)
+
+ reward_tensor_lst.append(reward_tensor)
+ data_source_lst.append(test_batch.non_tensor_batch.get('data_source', ['unknown'] * reward_tensor.shape[0]))
+
+ reward_tensor = torch.cat([rw.sum(-1) for rw in reward_tensor_lst], dim=0).cpu() # (batch_size,)
+ # reward_tensor = torch.cat(reward_tensor_lst, dim=0).sum(-1).cpu() # (batch_size,)
+ data_sources = np.concatenate(data_source_lst, axis=0)
+ # evaluate test_score based on data source
+ data_source_reward = {}
+ for i in range(reward_tensor.shape[0]):
+ data_source = data_sources[i]
+ if data_source not in data_source_reward:
+ data_source_reward[data_source] = []
+ data_source_reward[data_source].append(reward_tensor[i].item())
+
+ metric_dict = {}
+ for data_source, rewards in data_source_reward.items():
+ metric_dict[f'val/test_score/{data_source}'] = np.mean(rewards)
+
+ return metric_dict
+
+
+ def init_workers(self):
+ """Init resource pool and worker group"""
+ self.resource_pool_manager.create_resource_pool()
+
+ self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}
+
+ # create actor and rollout
+ if self.hybrid_engine:
+ resource_pool = self.resource_pool_manager.get_resource_pool(Role.ActorRollout)
+ actor_rollout_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.ActorRollout],
+ config=self.config.actor_rollout_ref,
+ role='actor_rollout')
+ self.resource_pool_to_cls[resource_pool]['actor_rollout'] = actor_rollout_cls
+ else:
+ raise NotImplementedError
+
+ # create critic
+ if self.config.algorithm.adv_estimator == 'gae':
+ resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic)
+ critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=self.config.critic)
+ self.resource_pool_to_cls[resource_pool]['critic'] = critic_cls
+ self.use_critic = True
+
+ elif self.config.algorithm.adv_estimator == 'grpo':
+ self.use_critic = False
+ else:
+ raise NotImplementedError
+
+ # create reference policy if needed
+ if self.use_reference_policy:
+ resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy)
+ ref_policy_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RefPolicy],
+ config=self.config.actor_rollout_ref,
+ role='ref')
+ self.resource_pool_to_cls[resource_pool]['ref'] = ref_policy_cls
+
+ # create a reward model if reward_fn is None
+ if self.use_rm:
+ # we create a RM here
+ resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel)
+ rm_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model)
+ self.resource_pool_to_cls[resource_pool]['rm'] = rm_cls
+
+ # initialize WorkerGroup
+ # NOTE: if you want to use a different resource pool for each role, which can support different parallel size,
+ # you should not use `create_colocated_worker_cls`. Instead, directly pass different resource pool to different worker groups.
+ # See https://github.com/volcengine/verl/blob/master/examples/ray/tutorial.ipynb for more information.
+ all_wg = {}
+ self.wg_dicts = []
+ for resource_pool, class_dict in self.resource_pool_to_cls.items():
+ worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
+ wg_dict = self.ray_worker_group_cls(resource_pool=resource_pool, ray_cls_with_init=worker_dict_cls)
+ spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
+ all_wg.update(spawn_wg)
+            # keep the reference of WorkerDict to support ray >= 2.31. Ref: https://github.com/ray-project/ray/pull/45699
+ self.wg_dicts.append(wg_dict)
+
+ if self.use_critic:
+ self.critic_wg = all_wg['critic']
+ self.critic_wg.init_model()
+
+ if self.use_reference_policy:
+ self.ref_policy_wg = all_wg['ref']
+ self.ref_policy_wg.init_model()
+
+ if self.use_rm:
+ self.rm_wg = all_wg['rm']
+ self.rm_wg.init_model()
+
+ # we should create rollout at the end so that vllm can have a better estimation of kv cache memory
+ self.actor_rollout_wg = all_wg['actor_rollout']
+ self.actor_rollout_wg.init_model()
+
+ def _save_checkpoint(self):
+ actor_local_path = os.path.join(self.config.trainer.default_local_dir, 'actor',
+ f'global_step_{self.global_steps}')
+ actor_remote_path = None if self.config.trainer.default_hdfs_dir is None else os.path.join(
+ self.config.trainer.default_hdfs_dir, 'actor')
+ self.actor_rollout_wg.save_checkpoint(actor_local_path, actor_remote_path)
+
+ if self.use_critic:
+ critic_local_path = os.path.join(self.config.trainer.default_local_dir, 'critic',
+ f'global_step_{self.global_steps}')
+ critic_remote_path = None if self.config.trainer.default_hdfs_dir is None else os.path.join(
+ self.config.trainer.default_hdfs_dir, 'critic')
+ self.critic_wg.save_checkpoint(critic_local_path, critic_remote_path)
+
+ def _balance_batch(self, batch: DataProto, metrics, logging_prefix='global_seqlen'):
+ """Reorder the data on single controller such that each dp rank gets similar total tokens"""
+ attention_mask = batch.batch['attention_mask']
+ batch_size = attention_mask.shape[0]
+ global_seqlen_lst = attention_mask.view(batch_size, -1).sum(-1).tolist() # (train_batch_size,)
+ world_size = self.actor_rollout_wg.world_size
+ global_partition_lst = get_seqlen_balanced_partitions(global_seqlen_lst,
+ k_partitions=world_size,
+ equal_size=True)
+ # reorder based on index. The data will be automatically equally partitioned by dispatch function
+ global_idx = torch.tensor([j for partition in global_partition_lst for j in partition])
+ batch.reorder(global_idx)
+ global_balance_stats = log_seqlen_unbalance(seqlen_list=global_seqlen_lst,
+ partitions=global_partition_lst,
+ prefix=logging_prefix)
+ metrics.update(global_balance_stats)
+
+ def fit(self):
+ """
+ The training loop of PPO.
+        The driver process only needs to call the compute functions of the worker group through RPC to construct the PPO dataflow.
+ The light-weight advantage computation is done on the driver process.
+ """
+
+ logger = self.logger
+ self.global_steps = 0
+ # perform validation before training
+ # currently, we only support validation using the reward_function.
+ if self.val_reward_fn is not None and self.config.trainer.get('val_before_train', True):
+ val_metrics = self._validate()
+ pprint(f'Initial validation metrics: {val_metrics}')
+ logger.log(data=val_metrics, step=self.global_steps)
+ if self.config.trainer.get('val_only', False):
+ return
+
+ # we start from step 1
+ self.global_steps += 1
+
+ # Agent config preparation
+ gen_config = GenerationConfig(
+ max_turns=self.config.max_turns,
+ max_start_length=self.config.data.max_start_length,
+ max_prompt_length=self.config.data.max_prompt_length,
+ max_response_length=self.config.data.max_response_length,
+ max_obs_length=self.config.data.max_obs_length,
+ num_gpus=self.config.trainer.n_gpus_per_node * self.config.trainer.nnodes,
+ no_think_rl=self.config.algorithm.no_think_rl,
+            search_url=self.config.retriever.url,
+            topk=self.config.retriever.topk,
+ )
+
+ generation_manager = LLMGenerationManager(
+ tokenizer=self.tokenizer,
+ actor_rollout_wg=self.actor_rollout_wg,
+ config=gen_config,
+ )
+
+ # start training loop
+ for epoch in range(self.config.trainer.total_epochs):
+ for batch_dict in self.train_dataloader:
+ print(f'epoch {epoch}, step {self.global_steps}')
+ metrics = {}
+ timing_raw = {}
+
+ batch: DataProto = DataProto.from_single_dict(batch_dict)
+ batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n_agent, interleave=True)
+
+ # pop those keys for generation
+ gen_batch = batch.pop(batch_keys=['input_ids', 'attention_mask', 'position_ids'])
+
+ ####################
+ # original code here
+
+ with _timer('step', timing_raw):
+ if not self.config.do_search:
+ gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch)
+
+ batch.non_tensor_batch['uid'] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))],
+ dtype=object)
+ # repeat to align with repeated responses in rollout
+ batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
+ batch = batch.union(gen_batch_output)
+
+ ####################
+                    # Below is all about agents - the "LLM + for-loop"
+ ####################
+ # with _timer('step', timing_raw):
+ else:
+ first_input_ids = gen_batch.batch['input_ids'][:, -gen_config.max_start_length:].clone().long()
+
+ with _timer('gen', timing_raw):
+ generation_manager.timing_raw = timing_raw
+ final_gen_batch_output = generation_manager.run_llm_loop(
+ gen_batch=gen_batch,
+ initial_input_ids=first_input_ids,
+ )
+
+ # final_gen_batch_output.batch.apply(lambda x: x.long(), inplace=True)
+ for key in final_gen_batch_output.batch.keys():
+ final_gen_batch_output.batch[key] = final_gen_batch_output.batch[key].long()
+
+ with torch.no_grad():
+ output = self.actor_rollout_wg.compute_log_prob(final_gen_batch_output)
+ final_gen_batch_output = final_gen_batch_output.union(output)
+
+ # batch.non_tensor_batch['uid'] = np.array([str(uuid.uuid4()) for _ in range(len(batch.batch))],
+ # dtype=object)
+ batch.non_tensor_batch['uid'] = batch.non_tensor_batch['index'].copy()
+
+ # repeat to align with repeated responses in rollout
+ batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
+ batch = batch.union(final_gen_batch_output)
+
+ ####################
+ ####################
+
+ # balance the number of valid tokens on each dp rank.
+ # Note that this breaks the order of data inside the batch.
+ # Please take care when you implement group based adv computation such as GRPO and rloo
+ self._balance_batch(batch, metrics=metrics)
+
+ # compute global_valid tokens
+ batch.meta_info['global_token_num'] = torch.sum(batch.batch['attention_mask'], dim=-1).tolist()
+
+ # batch.batch.apply(lambda x, key: x.long() if key != "old_log_probs" else x, inplace=True, key=True)
+ for key in batch.batch.keys():
+ if key != 'old_log_probs':
+ batch.batch[key] = batch.batch[key].long()
+
+ if self.use_reference_policy:
+ # compute reference log_prob
+ with _timer('ref', timing_raw):
+ ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
+ batch = batch.union(ref_log_prob)
+
+ # compute values
+ if self.use_critic:
+ with _timer('values', timing_raw):
+ values = self.critic_wg.compute_values(batch)
+ batch = batch.union(values)
+
+ with _timer('adv', timing_raw):
+ # compute scores. Support both model and function-based.
+ # We first compute the scores using reward model. Then, we call reward_fn to combine
+ # the results from reward model and rule-based results.
+ if self.use_rm:
+ # we first compute reward model score
+ reward_tensor = self.rm_wg.compute_rm_score(batch)
+ batch = batch.union(reward_tensor)
+
+ # we combine with rule-based rm
+ reward_tensor = self.reward_fn(batch)
+ batch.batch['token_level_scores'] = reward_tensor
+
+ # compute rewards. apply_kl_penalty if available
+ if not self.config.actor_rollout_ref.actor.use_kl_loss:
+ batch, kl_metrics = apply_kl_penalty(batch,
+ kl_ctrl=self.kl_ctrl,
+ kl_penalty=self.config.algorithm.kl_penalty)
+ metrics.update(kl_metrics)
+ else:
+ batch.batch['token_level_rewards'] = batch.batch['token_level_scores']
+
+ # compute advantages, executed on the driver process
+ batch = compute_advantage(batch,
+ adv_estimator=self.config.algorithm.adv_estimator,
+ gamma=self.config.algorithm.gamma,
+ lam=self.config.algorithm.lam,
+ num_repeat=self.config.actor_rollout_ref.rollout.n)
+
+ # update critic
+ if self.use_critic:
+ with _timer('update_critic', timing_raw):
+ critic_output = self.critic_wg.update_critic(batch)
+ critic_output_metrics = reduce_metrics(critic_output.meta_info['metrics'])
+ metrics.update(critic_output_metrics)
+
+ # implement critic warmup
+ if self.config.trainer.critic_warmup <= self.global_steps:
+ # update actor
+ with _timer('update_actor', timing_raw):
+ if self.config.do_search and self.config.actor_rollout_ref.actor.state_masking:
+ batch, metrics = self._create_loss_mask(batch, metrics)
+ actor_output = self.actor_rollout_wg.update_actor(batch)
+ actor_output_metrics = reduce_metrics(actor_output.meta_info['metrics'])
+ metrics.update(actor_output_metrics)
+
+ # validate
+ if self.val_reward_fn is not None and self.config.trainer.test_freq > 0 and \
+ self.global_steps % self.config.trainer.test_freq == 0:
+ with _timer('testing', timing_raw):
+ val_metrics: dict = self._validate()
+ metrics.update(val_metrics)
+
+ if self.config.trainer.save_freq > 0 and \
+ self.global_steps % self.config.trainer.save_freq == 0:
+ with _timer('save_checkpoint', timing_raw):
+ self._save_checkpoint()
+
+ # collect metrics
+ metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
+ metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
+
+ # TODO: make a canonical logger that supports various backend
+ logger.log(data=metrics, step=self.global_steps)
+
+ self.global_steps += 1
+
+ if self.global_steps >= self.total_training_steps:
+
+ # perform validation after training
+ if self.val_reward_fn is not None:
+ val_metrics = self._validate()
+ pprint(f'Final validation metrics: {val_metrics}')
+ logger.log(data=val_metrics, step=self.global_steps)
+ return
+
+ def _create_loss_mask(self, batch, metrics):
+ """Create loss mask for state tokens."""
+ response_length = batch.batch['responses'].shape[-1]
+ response_mask = batch.batch['attention_mask'][:, -response_length:]
+
+ loss_mask = batch.batch['info_mask'][:, -response_length:]
+ batch.batch['loss_mask'] = loss_mask
+
+ metrics.update({
+ 'state_tokens/total': loss_mask.sum().item(),
+ 'state_tokens/coverage': (loss_mask.sum() / response_mask.sum()).item(),
+ })
+
+ return batch, metrics
diff --git a/code/RL_model/verl/Search-R1/verl/trainer/runtime_env.yaml b/code/RL_model/verl/Search-R1/verl/trainer/runtime_env.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..87bd05a9aabbc5db602626895518bb19add408d1
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/trainer/runtime_env.yaml
@@ -0,0 +1,5 @@
+working_dir: ./
+excludes: ["/.git/"]
+env_vars:
+ TORCH_NCCL_AVOID_RECORD_STREAMS: "1"
+ VLLM_ATTENTION_BACKEND: "XFORMERS"
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/utils/__init__.py b/code/RL_model/verl/Search-R1/verl/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e453070a16370cd7006e0a7700c8550a56f19051
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import tokenizer
+from .tokenizer import *
+
+__all__ = tokenizer.__all__
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/utils/config.py b/code/RL_model/verl/Search-R1/verl/utils/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c9298c42adf89467d047a3d0fdf8919bf772a5a
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/config.py
@@ -0,0 +1,23 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict
+
+from omegaconf import DictConfig
+
+
+def update_dict_with_config(dictionary: Dict, config: DictConfig):
+ for key in dictionary:
+ if hasattr(config, key):
+ dictionary[key] = getattr(config, key)
diff --git a/code/RL_model/verl/Search-R1/verl/utils/dataset/README.md b/code/RL_model/verl/Search-R1/verl/utils/dataset/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f886a70aabf443fb167453d667529b62f3311765
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/dataset/README.md
@@ -0,0 +1,16 @@
+# Dataset Format
+## RLHF dataset
+We combine all the data sources into a single parquet file. We organize the prompt directly in the chat format so that multi-turn chats can be easily incorporated. In the prompt, we may add instruction-following text to guide the model to output the answers in a particular format so that we can extract them.
+
+Math problems
+```json
+{
+ "data_source": "openai/gsm8k",
+ "prompt": [{"role": "user", "content": "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May? Let's think step by step and output the final answer after \"####\""}],
+ "ability": "math",
+ "reward_model": {
+ "style": "rule",
+ "ground_truth": ["72"]
+ },
+}
+```
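+
+A minimal sketch (assuming `pandas` with `pyarrow` installed) of how such a parquet file could be produced; the column names simply mirror the JSON example above:
+
+```python
+import pandas as pd
+
+# One gsm8k-style row; nested fields (lists/dicts) are stored as-is in parquet columns.
+row = {
+    "data_source": "openai/gsm8k",
+    "prompt": [{"role": "user", "content": "Natalia sold clips to 48 of her friends..."}],
+    "ability": "math",
+    "reward_model": {"style": "rule", "ground_truth": ["72"]},
+}
+pd.DataFrame([row]).to_parquet("train.parquet")
+```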
diff --git a/code/RL_model/verl/Search-R1/verl/utils/dataset/__init__.py b/code/RL_model/verl/Search-R1/verl/utils/dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7f9b71c54c253a1cfabc7e9942ece086ec84903
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/dataset/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .rl_dataset import RLHFDataset
+from .rm_dataset import RMDataset
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/utils/dataset/rl_dataset.py b/code/RL_model/verl/Search-R1/verl/utils/dataset/rl_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b5f65f4a841c4272ce3311a9f01b52ea60b1351
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/dataset/rl_dataset.py
@@ -0,0 +1,155 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from omegaconf import ListConfig
+import os
+from typing import List, Union
+
+import pandas as pd
+
+import torch
+import numpy as np
+from torch.utils.data import Dataset, DataLoader
+from transformers import AutoTokenizer, PreTrainedTokenizer
+from verl.utils.fs import copy_local_path_from_hdfs
+
+from verl.utils.model import compute_position_id_with_mask
+import verl.utils.torch_functional as verl_F
+
+
+def collate_fn(data_list: list[dict]) -> dict:
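+    """Collate a list of per-sample dicts into a batch dict: tensor values are
+    stacked along a new batch dimension, while non-tensor values are gathered
+    into numpy object arrays of length batch_size."""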
+ tensors = {}
+ non_tensors = {}
+
+ for data in data_list:
+ for key, val in data.items():
+ if isinstance(val, torch.Tensor):
+ if key not in tensors:
+ tensors[key] = []
+ tensors[key].append(val)
+ else:
+ if key not in non_tensors:
+ non_tensors[key] = []
+ non_tensors[key].append(val)
+
+ for key, val in tensors.items():
+ tensors[key] = torch.stack(val, dim=0)
+
+ for key, val in non_tensors.items():
+ non_tensors[key] = np.array(val, dtype=object)
+
+ output = {}
+ output.update(tensors)
+ output.update(non_tensors)
+ return output
+
+
+class RLHFDataset(Dataset):
+ """
+    We assume the dataset contains a prompt column along with other information
+ """
+
+ def __init__(self,
+ parquet_files: Union[str, List[str]],
+ tokenizer: PreTrainedTokenizer,
+ prompt_key='prompt',
+ max_prompt_length=1024,
+ filter_prompts=True,
+ cache_dir='~/.cache/verl/rlhf',
+ chat_template_func=None,
+ return_raw_chat=False,
+ truncation='error'):
+ if not isinstance(parquet_files, (List, ListConfig)):
+ parquet_files = [parquet_files]
+
+ self.parquet_files = parquet_files
+ self.cache_dir = os.path.expanduser(cache_dir)
+ self.tokenizer = tokenizer
+
+ self.prompt_key = prompt_key
+ self.max_prompt_length = max_prompt_length
+ self.filter_prompts = filter_prompts
+
+ self.return_raw_chat = return_raw_chat
+ self.chat_template_func = chat_template_func
+ self.truncation = truncation
+
+ self._download()
+ self._read_files_and_tokenize()
+
+ def _download(self):
+ from verl.utils.fs import copy_local_path_from_hdfs
+ for i, parquet_file in enumerate(self.parquet_files):
+ self.parquet_files[i] = copy_local_path_from_hdfs(src=parquet_file, cache_dir=self.cache_dir)
+
+ def _read_files_and_tokenize(self):
+ dataframes = []
+ for parquet_file in self.parquet_files:
+ # read parquet files and cache
+ dataframe = pd.read_parquet(parquet_file)
+ dataframes.append(dataframe)
+ self.dataframe = pd.concat(dataframes)
+
+ print(f'original dataset len: {len(self.dataframe)}')
+
+ # filter out too long prompts
+ tokenizer = self.tokenizer
+ prompt_key = self.prompt_key
+
+        # prompt-length filtering is disabled below: keep prompts regardless of length
+ # self.dataframe = self.dataframe[self.dataframe.apply(lambda doc: len(
+ # tokenizer.apply_chat_template(doc[prompt_key], add_generation_prompt=True)) <= self.max_prompt_length,
+ # axis=1)]
+
+        print(f'filtered dataset len: {len(self.dataframe)}')
+
+ def __len__(self):
+ return len(self.dataframe)
+
+ def __getitem__(self, item):
+ """
+        Note that we also return the raw_input_ids so that it can be combined with other chat templates
+ """
+ row_dict = self.dataframe.iloc[item].to_dict()
+
+ chat = row_dict.pop(self.prompt_key)
+
+ if self.tokenizer.chat_template:
+ prompt_with_chat_template = self.tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)
+ else:
+ prompt_with_chat_template = chat[0]['content']
+ # prompt_with_chat_template = chat
+
+ input_ids, attention_mask = verl_F.tokenize_and_postprocess_data(prompt=prompt_with_chat_template,
+ tokenizer=self.tokenizer,
+ max_length=self.max_prompt_length,
+ pad_token_id=self.tokenizer.pad_token_id,
+ left_pad=True,
+ truncation=self.truncation)
+
+ position_ids = compute_position_id_with_mask(attention_mask)
+
+ row_dict['input_ids'] = input_ids[0]
+ row_dict['attention_mask'] = attention_mask[0]
+ row_dict['position_ids'] = position_ids[0]
+
+ # encode prompts without chat template
+ if self.return_raw_chat:
+ row_dict['raw_prompt'] = chat.tolist()
+
+ # add index for each prompt
+ index = row_dict.get("extra_info", {}).get("index", 0)
+ row_dict["index"] = index
+
+ return row_dict
diff --git a/code/RL_model/verl/Search-R1/verl/utils/dataset/rm_dataset.py b/code/RL_model/verl/Search-R1/verl/utils/dataset/rm_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..cba178db3d816b5291d836cbc4b30fed5b817944
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/dataset/rm_dataset.py
@@ -0,0 +1,143 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import List, Union
+
+import pandas as pd
+
+import torch
+from torch.utils.data import Dataset
+from transformers import AutoTokenizer
+
+from verl.utils import hf_tokenizer
+
+
+def download_files_distributed(download_fn):
+ import torch.distributed
+ if torch.distributed.is_initialized():
+ if torch.distributed.get_rank() == 0:
+ # download files
+ download_fn()
+
+ torch.distributed.barrier()
+ else:
+ # download anyway
+ download_fn()
+
+
+class RMDataset(Dataset):
+
+ def __init__(self,
+ parquet_files: Union[str, List[str]],
+ tokenizer,
+ prompt_key='prompt',
+ chosen_key='chosen',
+ rejected_key='rejected',
+ max_length=1024,
+ add_eos=True,
+ cache_dir='~/.cache/verl/rm'):
+ if not isinstance(parquet_files, List):
+ parquet_files = [parquet_files]
+
+ self.parquet_files = parquet_files
+ self.cache_dir = os.path.expanduser(cache_dir)
+ if isinstance(tokenizer, str):
+ tokenizer = hf_tokenizer(tokenizer)
+ self.tokenizer = tokenizer
+
+ self.prompt_key = prompt_key
+ self.chosen_key = chosen_key
+ self.rejected_key = rejected_key
+
+ self.add_eos = add_eos
+ self.max_length = max_length
+
+ self._download()
+ self._read_files_and_tokenize()
+
+ def _download(self):
+
+ def _download_files():
+ from verl.utils.fs import copy, _is_non_local
+ os.makedirs(self.cache_dir, exist_ok=True)
+ assert os.path.exists(self.cache_dir)
+ for i, parquet_file in enumerate(self.parquet_files):
+ if _is_non_local(parquet_file):
+ dst = os.path.join(self.cache_dir, os.path.basename(parquet_file))
+ if not os.path.exists(dst):
+ copy(src=parquet_file, dst=dst)
+ self.parquet_files[i] = dst
+
+ download_files_distributed(_download_files)
+
+ def _read_files_and_tokenize(self):
+ dataframes = []
+ for parquet_file in self.parquet_files:
+ # read parquet files and cache
+ dataframe = pd.read_parquet(parquet_file)
+ dataframes.append(dataframe)
+ self.dataframe = pd.concat(dataframes)
+ self.prompts = self.dataframe[self.prompt_key].tolist()
+ self.chosen_responses = self.dataframe[self.chosen_key].tolist()
+ self.rejected_responses = self.dataframe[self.rejected_key].tolist()
+
+ def __len__(self):
+ return len(self.prompts)
+
+ def _pad_to_length(self, input_ids, attention_mask):
+ curr_length = input_ids.shape[-1]
+
+ if curr_length < self.max_length:
+ input_ids = torch.cat(
+ (input_ids, torch.zeros(size=(self.max_length - curr_length,), dtype=input_ids.dtype)), dim=-1)
+ attention_mask = torch.cat(
+ (attention_mask, torch.zeros(size=(self.max_length - curr_length,), dtype=attention_mask.dtype)),
+ dim=-1)
+ elif curr_length > self.max_length:
+ input_ids = input_ids[:self.max_length]
+ attention_mask = attention_mask[:self.max_length]
+
+ return input_ids, attention_mask
+
+ def __getitem__(self, item):
+ prompt = self.prompts[item]
+ chosen_response = self.chosen_responses[item]
+ rejected_response = self.rejected_responses[item]
+
+ prompt_ids = self.tokenizer(prompt, return_tensors='pt')['input_ids'][0]
+ chosen_response_ids = self.tokenizer(chosen_response, return_tensors='pt')['input_ids'][0]
+ rejected_response_ids = self.tokenizer(rejected_response, return_tensors='pt')['input_ids'][0]
+
+ if self.add_eos:
+ chosen_response_ids = torch.cat((chosen_response_ids, torch.tensor([self.tokenizer.eos_token_id])), dim=-1)
+ rejected_response_ids = torch.cat((rejected_response_ids, torch.tensor([self.tokenizer.eos_token_id])),
+ dim=-1)
+
+ chosen_input_ids = torch.cat((prompt_ids, chosen_response_ids), dim=-1)
+ chosen_attention_mask = torch.ones_like(chosen_input_ids)
+
+ rejected_input_ids = torch.cat((prompt_ids, rejected_response_ids), dim=-1)
+ rejected_attention_mask = torch.ones_like(rejected_input_ids)
+
+ chosen_input_ids, chosen_attention_mask = self._pad_to_length(chosen_input_ids, chosen_attention_mask)
+ rejected_input_ids, rejected_attention_mask = self._pad_to_length(rejected_input_ids, rejected_attention_mask)
+
+ input_ids = torch.stack((chosen_input_ids, rejected_input_ids), dim=0)
+        attention_mask = torch.stack((chosen_attention_mask, rejected_attention_mask), dim=0)
+
+ return {
+ 'input_ids': input_ids,
+ 'attention_mask': attention_mask,
+ }
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/utils/debug/__init__.py b/code/RL_model/verl/Search-R1/verl/utils/debug/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d0b3432eb4d6200ed84da0f735afa46735ef58e
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/debug/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .performance import log_gpu_memory_usage
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/utils/debug/performance.py b/code/RL_model/verl/Search-R1/verl/utils/debug/performance.py
new file mode 100644
index 0000000000000000000000000000000000000000..615475a66a5e45853540df2efd09c25991b43e12
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/debug/performance.py
@@ -0,0 +1,30 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.distributed as dist
+import logging
+
+
+def log_gpu_memory_usage(head: str, logger: logging.Logger = None, level=logging.DEBUG, rank: int = 0):
+ if (not dist.is_initialized()) or (rank is None) or (dist.get_rank() == rank):
+ memory_allocated = torch.cuda.memory_allocated() / 1024**3
+ memory_reserved = torch.cuda.memory_reserved() / 1024**3
+
+ message = f'{head}, memory allocated (GB): {memory_allocated}, memory reserved (GB): {memory_reserved}'
+
+ if logger is None:
+ print(message)
+ else:
+ logger.log(msg=message, level=level)
diff --git a/code/RL_model/verl/Search-R1/verl/utils/debug/trajectory_tracker.py b/code/RL_model/verl/Search-R1/verl/utils/debug/trajectory_tracker.py
new file mode 100644
index 0000000000000000000000000000000000000000..33b254685221a86b03f120b57659cd55b29ea0a2
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/debug/trajectory_tracker.py
@@ -0,0 +1,108 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Trajectory tracker can be inserted into code to save the intermediate results.
+The results will be dump to hdfs for offline comparison.
+Each process will have a client that first move all the tensors to CPU
+"""
+
+from verl.utils.hdfs_io import makedirs, copy
+import torch
+import os
+import ray
+import io
+import tempfile
+
+from collections import deque
+
+remote_copy = ray.remote(copy)
+
+
+@ray.remote
+def save_to_hdfs(data: io.BytesIO, name, hdfs_dir, verbose):
+ filename = name + '.pth'
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ local_filepath = os.path.join(tmpdirname, filename)
+ with open(local_filepath, 'wb') as f:
+ f.write(data.getbuffer())
+ # upload to hdfs
+
+ if verbose:
+ print(f'Saving {local_filepath} to {hdfs_dir}')
+ try:
+ copy(local_filepath, hdfs_dir)
+ except Exception as e:
+ print(e)
+
+
+@ray.remote
+class TrajectoryTracker():
+
+ def __init__(self, hdfs_dir, verbose) -> None:
+ self.hdfs_dir = hdfs_dir
+ makedirs(hdfs_dir)
+ self.verbose = verbose
+
+ self.handle = deque()
+
+ def dump(self, data: io.BytesIO, name):
+ # get a temp file and write to it
+ self.handle.append(save_to_hdfs.remote(data, name, self.hdfs_dir, self.verbose))
+
+ def wait_for_hdfs(self):
+ while len(self.handle) != 0:
+ future = self.handle.popleft()
+ ray.get(future)
+
+
+def dump_data(data, name):
+ enable = os.getenv('VERL_ENABLE_TRACKER', '0') == '1'
+ if not enable:
+ return
+ buffer = io.BytesIO()
+ torch.save(data, buffer)
+ tracker = get_trajectory_tracker()
+ ray.get(tracker.dump.remote(buffer, name))
+
+
+def get_trajectory_tracker():
+ hdfs_dir = os.getenv('VERL_TRACKER_HDFS_DIR', default=None)
+ verbose = os.getenv('VERL_TRACKER_VERBOSE', default='0') == '1'
+ assert hdfs_dir is not None
+ tracker = TrajectoryTracker.options(name="global_tracker", get_if_exists=True,
+ lifetime="detached").remote(hdfs_dir, verbose)
+ return tracker
+
+
+if __name__ == '__main__':
+ # testing
+ os.environ['VERL_ENABLE_TRACKER'] = '1'
+ os.environ['VERL_TRACKER_HDFS_DIR'] = '~/debug/test'
+
+ @ray.remote
+    def process(idx):
+        data = {'obs': torch.randn(10, 20)}
+        dump_data(data, f'process_{idx}_obs')
+
+ ray.init()
+
+ output_lst = []
+
+ for i in range(10):
+ output_lst.append(process.remote(i))
+
+ out = ray.get(output_lst)
+
+ tracker = get_trajectory_tracker()
+ ray.get(tracker.wait_for_hdfs.remote())
diff --git a/code/RL_model/verl/Search-R1/verl/utils/distributed.py b/code/RL_model/verl/Search-R1/verl/utils/distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fea5a29cd943ef91c8f27f44db2a69e40702cf7
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/distributed.py
@@ -0,0 +1,28 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utilities for distributed training."""
+import os
+
+
+def initialize_global_process_group(timeout_second=36000):
+ import torch.distributed
+ from datetime import timedelta
+ torch.distributed.init_process_group('nccl', timeout=timedelta(seconds=timeout_second))
+ local_rank = int(os.environ["LOCAL_RANK"])
+ rank = int(os.environ["RANK"])
+ world_size = int(os.environ["WORLD_SIZE"])
+
+ if torch.distributed.is_initialized():
+ torch.cuda.set_device(local_rank)
+ return local_rank, rank, world_size
diff --git a/code/RL_model/verl/Search-R1/verl/utils/flops_counter.py b/code/RL_model/verl/Search-R1/verl/utils/flops_counter.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c5ac1a91160fc3265589fb6e93e93c8c1efb53e
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/flops_counter.py
@@ -0,0 +1,123 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from transformers import PretrainedConfig, Qwen2Config, LlamaConfig
+
+VALID_CONFIG_TYPE = (Qwen2Config, LlamaConfig)
+
+
+def get_device_flops(unit="T"):
+
+ def unit_convert(number, level):
+ units = ["B", "K", "M", "G", "T", "P"]
+ if number <= 0:
+ return number
+ ptr = 0
+ while ptr < len(units) and units[ptr] != level:
+ number /= 1000
+ ptr += 1
+ return number
+
+ device_name = torch.cuda.get_device_name()
+ flops = float("inf") # INF flops for unkown gpu type
+ if "H100" in device_name or "H800" in device_name:
+ flops = 989e12
+ elif "A100" in device_name or "A800" in device_name:
+ flops = 312e12
+ elif "L40" in device_name:
+ flops = 181.05e12
+ elif "L20" in device_name:
+ flops = 119.5e12
+ elif "H20" in device_name:
+ flops = 148e12
+ elif "910B" in device_name:
+ flops = 354e12
+ flops_unit = unit_convert(flops, unit)
+ return flops_unit
+
+
+class FlopsCounter:
+ """
+ Used to count mfu during training loop
+
+ Example:
+ flops_counter = FlopsCounter(config)
+ flops_achieved, flops_promised = flops_counter.estimate_flops(tokens_list, delta_time)
+
+ """
+
+ def __init__(self, config: PretrainedConfig):
+ if not isinstance(config, VALID_CONFIG_TYPE):
+ print(f"Only support config type of {VALID_CONFIG_TYPE}, but got {type(config)}. "
+ f"MFU will always be zero.")
+
+ self.estimate_func = {"qwen2": self._estimate_qwen2_flops, 'llama': self._estimate_qwen2_flops}
+ self.config = config
+
+ def _estimate_unknown_flops(self, tokens_sum, batch_seqlens, delta_time):
+ return 0
+
+ def _estimate_qwen2_flops(self, tokens_sum, batch_seqlens, delta_time):
+ assert isinstance(self.config, (Qwen2Config, LlamaConfig))
+ hidden_size = self.config.hidden_size
+ vocab_size = self.config.vocab_size
+ num_hidden_layers = self.config.num_hidden_layers
+ num_key_value_heads = self.config.num_key_value_heads
+ num_attention_heads = self.config.num_attention_heads
+ intermediate_size = self.config.intermediate_size
+
+ head_dim = hidden_size // num_attention_heads
+ q_size = num_attention_heads * head_dim
+ k_size = num_key_value_heads * head_dim
+ v_size = num_key_value_heads * head_dim
+
+        # non-attn per-layer params
+        # Qwen2/Llama use SwiGLU: gate, up and down linear layers in the MLP
+ mlp_N = hidden_size * intermediate_size * 3
+ attn_linear_N = hidden_size * (q_size + k_size + v_size + num_attention_heads * head_dim)
+ emd_and_lm_head_N = vocab_size * hidden_size * 2
+        # non-attn all-layer params
+ dense_N = (mlp_N + attn_linear_N) * num_hidden_layers + emd_and_lm_head_N
+ # non-attn all_layer & all_token fwd & bwd flops
+ dense_N_flops = 6 * dense_N * tokens_sum
+
+ # attn all_layer & all_token fwd & bwd flops
+ seqlen_square_sum = 0
+ for seqlen in batch_seqlens:
+ seqlen_square_sum += seqlen * seqlen
+ attn_qkv_flops = 12 * seqlen_square_sum * head_dim * num_attention_heads * num_hidden_layers
+
+ # all_layer & all_token fwd & bwd flops
+ flops_all_token = dense_N_flops + attn_qkv_flops
+ flops_achieved = flops_all_token * (1.0 / delta_time) / 1e12
+ return flops_achieved
+
+ def estimate_flops(self, batch_seqlens, delta_time):
+ """
+ Estimate the FLOPS based on the number of valid tokens in the current batch and the time taken.
+
+ Args:
+ batch_seqlens (List[int]): A list where each element represents the number of valid tokens in the current batch.
+ delta_time (float): The time taken to process the batch, in seconds.
+
+ Returns:
+ estimated_flops (float): The estimated FLOPS based on the input tokens and time.
+ promised_flops (float): The expected FLOPS of the current device.
+ """
+ tokens_sum = sum(batch_seqlens)
+ func = self.estimate_func.get(self.config.model_type, self._estimate_unknown_flops)
+ estimated_flops = func(tokens_sum, batch_seqlens, delta_time)
+ promised_flops = get_device_flops()
+ return estimated_flops, promised_flops
diff --git a/code/RL_model/verl/Search-R1/verl/utils/fs.py b/code/RL_model/verl/Search-R1/verl/utils/fs.py
new file mode 100644
index 0000000000000000000000000000000000000000..80c1889be3582fffcdef5267f5e9ac55e1d7e059
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/fs.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+"""File-system agnostic IO APIs"""
+import os
+import tempfile
+import hashlib
+
+from .hdfs_io import copy, makedirs, exists
+
+__all__ = ["copy", "exists", "makedirs"]
+
+_HDFS_PREFIX = "hdfs://"
+
+
+def _is_non_local(path):
+ return path.startswith(_HDFS_PREFIX)
+
+
+def md5_encode(path: str) -> str:
+ return hashlib.md5(path.encode()).hexdigest()
+
+
+def get_local_temp_path(hdfs_path: str, cache_dir: str) -> str:
+ """Return a local temp path that joins cache_dir and basename of hdfs_path
+
+ Args:
+ hdfs_path:
+ cache_dir:
+
+ Returns:
+
+ """
+    # make an md5 encoding of hdfs_path to avoid directory conflict
+ encoded_hdfs_path = md5_encode(hdfs_path)
+ temp_dir = os.path.join(cache_dir, encoded_hdfs_path)
+ os.makedirs(temp_dir, exist_ok=True)
+ dst = os.path.join(temp_dir, os.path.basename(hdfs_path))
+ return dst
+
+
+def copy_local_path_from_hdfs(src: str, cache_dir=None, filelock='.file.lock', verbose=False) -> str:
+ """Copy src from hdfs to local if src is on hdfs or directly return src.
+ If cache_dir is None, we will use the default cache dir of the system. Note that this may cause conflicts if
+ the src name is the same between calls
+
+ Args:
+        src (str): an HDFS path or a local path
+
+ Returns:
+ a local path of the copied file
+ """
+ from filelock import FileLock
+
+ assert src[-1] != '/', f'Make sure the last char in src is not / because it will cause error. Got {src}'
+
+ if _is_non_local(src):
+ # download from hdfs to local
+ if cache_dir is None:
+ # get a temp folder
+ cache_dir = tempfile.gettempdir()
+ os.makedirs(cache_dir, exist_ok=True)
+ assert os.path.exists(cache_dir)
+ local_path = get_local_temp_path(src, cache_dir)
+ # get a specific lock
+ filelock = md5_encode(src) + '.lock'
+ lock_file = os.path.join(cache_dir, filelock)
+ with FileLock(lock_file=lock_file):
+ if not os.path.exists(local_path):
+ if verbose:
+ print(f'Copy from {src} to {local_path}')
+ copy(src, local_path)
+ return local_path
+ else:
+ return src
diff --git a/code/RL_model/verl/Search-R1/verl/utils/fsdp_utils.py b/code/RL_model/verl/Search-R1/verl/utils/fsdp_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0243cd15c2d2defe8e54164c6e07a05c5f6232d
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/fsdp_utils.py
@@ -0,0 +1,329 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict
+import functools
+import json
+import math
+import itertools
+import os
+from contextlib import contextmanager
+from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, transformer_auto_wrap_policy
+from transformers.trainer_pt_utils import get_module_class_from_name
+import torch
+import torch.nn as nn
+import torch.distributed as dist
+
+
+def init_fn(x: torch.nn.Module):
+ if not torch.distributed.get_rank() == 0:
+ x = x.to_empty(device=torch.cuda.current_device(), recurse=False)
+ torch.cuda.empty_cache()
+ return x
+
+
+def get_init_weight_context_manager(use_meta_tensor=True):
+ from accelerate import init_empty_weights
+ cpu_init_weights = lambda: torch.device('cpu')
+ if use_meta_tensor:
+ init_context = init_empty_weights if torch.distributed.get_rank() != 0 else cpu_init_weights
+ else:
+ init_context = cpu_init_weights
+ return init_context
+
+
+# Copyright 2020-present the HuggingFace Inc. team.
+# Adapted from https://github.com/huggingface/transformers/src/transformers/trainer.py
+def get_fsdp_wrap_policy(module, config=None, is_lora=False):
+ """Get FSDP wrap policy for the module.
+
+ Args:
+ module: The module to get wrap policy for
+ config: Configuration for wrap policy
+ is_lora: Whether to enable lambda policy for LoRA modules
+ """
+ if config is None:
+ config = {}
+
+ if config.get('disable', False):
+ return None
+
+ default_transformer_cls_names_to_wrap = getattr(module, "_no_split_modules", None)
+ fsdp_transformer_layer_cls_to_wrap = config.get("transformer_layer_cls_to_wrap",
+ default_transformer_cls_names_to_wrap)
+ min_num_params = config.get('min_num_params', 0)
+ auto_wrap_policy = None
+
+ policies = []
+
+ from torch.distributed.fsdp.wrap import _or_policy, lambda_auto_wrap_policy, transformer_auto_wrap_policy
+
+ # Add lambda policy for LoRA modules if is_lora is True
+ if is_lora:
+
+ def lambda_policy_fn(module):
+ if (len(list(module.named_children())) == 0 and getattr(module, "weight", None) is not None and
+ module.weight.requires_grad):
+ return True
+ return False
+
+ lambda_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn)
+ policies.append(lambda_policy)
+
+ if min_num_params > 0:
+ size_policy = functools.partial(size_based_auto_wrap_policy, min_num_params=min_num_params)
+ policies.append(size_policy)
+ elif fsdp_transformer_layer_cls_to_wrap is not None:
+ transformer_cls_to_wrap = set()
+ for layer_class in fsdp_transformer_layer_cls_to_wrap:
+ transformer_cls = get_module_class_from_name(module, layer_class)
+ if transformer_cls is None:
+ raise Exception("Could not find the transformer layer class to wrap in the model.")
+ else:
+ transformer_cls_to_wrap.add(transformer_cls)
+
+ transformer_policy = functools.partial(
+ transformer_auto_wrap_policy,
+ transformer_layer_cls=transformer_cls_to_wrap,
+ )
+ policies.append(transformer_policy)
+
+ if len(policies) > 0:
+ auto_wrap_policy = functools.partial(_or_policy, policies=policies)
+
+ return auto_wrap_policy
+
+
+def offload_fsdp_grad(module):
+ for _, param in module.named_parameters():
+ if param.grad is not None:
+ param.grad = param.grad.to("cpu", non_blocking=True)
+ torch.cuda.empty_cache()
+
+
+def load_fsdp_grad(module, device_id):
+ for _, param in module.named_parameters():
+ if param.grad is not None:
+ param.grad = param.grad.to(device_id, non_blocking=True)
+ torch.cuda.empty_cache()
+
+
+def offload_fsdp_param_and_grad(module, offload_grad=False):
+ for _, param in module.named_parameters():
+ if hasattr(param, "_local_shard"):
+ param._local_shard = param._local_shard.to("cpu", non_blocking=True)
+ param.data = param.data.to('cpu', non_blocking=True)
+ if offload_grad and param.grad is not None:
+ param.grad = param.grad.to("cpu", non_blocking=True)
+ torch.cuda.empty_cache()
+
+
+def load_fsdp_param_and_grad(module, device_id, load_grad=False):
+ for _, param in module.named_parameters():
+ if hasattr(param, "_local_shard"):
+ param._local_shard = param._local_shard.to(device_id, non_blocking=True)
+ param.data = param.data.to(device_id, non_blocking=True)
+ if load_grad and param.grad is not None:
+ param.grad = param.grad.to(device_id, non_blocking=True)
+ torch.cuda.empty_cache()
+
+
+def offload_fsdp_optimizer(optimizer):
+ for param_group in optimizer.param_groups:
+ for param in param_group['params']:
+ state = optimizer.state[param]
+ for key, value in state.items():
+ if isinstance(value, torch.Tensor):
+ state[key] = value.to("cpu", non_blocking=True)
+ torch.cuda.empty_cache()
+
+
+def load_fsdp_optimizer(optimizer, device_id):
+ for param_group in optimizer.param_groups:
+ for param in param_group['params']:
+ state = optimizer.state[param]
+ for key, value in state.items():
+ if isinstance(value, torch.Tensor):
+ state[key] = value.to(device_id, non_blocking=True)
+ torch.cuda.empty_cache()
+
+
+@contextmanager
+def meta_device_init():
+ """
+ Create model parameters with meta device.
+
+    Note that buffers in the model will still be initialized on the default device (e.g., CPU),
+    since buffers can be non-persistent and filled with precomputed values that can
+    NOT be captured on the meta device.
+ """
+ device = torch.device("meta")
+ old_register_parameter = nn.Module.register_parameter
+ registered = set()
+
+ def register_empty_parameter(module, name, param):
+ old_register_parameter(module, name, param)
+ # we will skip register shared parameters as it
+ # is already registered previously
+ if param is not None and param not in registered:
+ param_cls = type(module._parameters[name])
+ kwargs = module._parameters[name].__dict__
+ kwargs["requires_grad"] = param.requires_grad
+ module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
+ registered.add(module._parameters[name])
+
+ try:
+ nn.Module.register_parameter = register_empty_parameter
+ yield
+ finally:
+ registered.clear()
+ nn.Module.register_parameter = old_register_parameter
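+
+# Usage sketch (illustrative; `MyModel` is a hypothetical module):
+#
+#   with meta_device_init():
+#       model = MyModel()   # parameters are allocated on the "meta" device
+#   assert next(model.parameters()).is_meta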
+
+
+def parallel_load_safetensors(filepath):
+ """
+ Parallel load safetensors from huggingface checkpoint
+
+ Huggingface checkpoint contains:
+
+ - config.json: a json file for model configuration
+ - model.safetensors.index.json: a json file indexing the safetensors (parameters & buffers)
+ - model-000x-of-000x.safetensors: binary files holding chunks of the safetensors (parameters & buffers)
+
+ Or (when model is small),
+
+ - model.safetensors: a binary file for all parameters and buffers
+
+ Each rank will own a part of model chunks and load them directly into GPU memory.
+ """
+ from safetensors.torch import load_file
+
+ safetensors2param = {}
+
+ index_file = os.path.join(filepath, "model.safetensors.index.json")
+ if os.path.exists(index_file):
+ with open(index_file, "rb") as f:
+ index = json.load(f)
+ for param_name, filename in index["weight_map"].items():
+ safetensors2param.setdefault(filename, []).append(param_name)
+ else:
+ # in this case, the model is small and we can load it all at once
+ param_file = os.path.join(filepath, "model.safetensors")
+ assert os.path.exists(param_file), f"Cannot find {param_file}"
+ states = load_file(param_file)
+ for param_name in states:
+ safetensors2param.setdefault("model.safetensors", []).append(param_name)
+ del states
+
+ total_files = len(safetensors2param)
+ ckpt_chunks = sorted(safetensors2param.keys())
+ world_size = dist.get_world_size()
+ size = int(math.ceil(total_files / world_size))
+ ckpt_chunks = [ckpt_chunks[rank * size:rank * size + size] for rank in range(world_size)]
+
+ shard_states = {}
+ device = torch.cuda.current_device()
+ for rank, files in enumerate(ckpt_chunks):
+ if rank == dist.get_rank():
+ for file in files:
+ file = os.path.join(filepath, file)
+ states = load_file(file, device=device)
+ # print(f"rank {rank} loading {file}...")
+ shard_states.update(states)
+ else:
+ for file in files:
+ for param_name in safetensors2param[file]:
+ shard_states[param_name] = rank
+ return shard_states
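+
+# Usage sketch (illustrative): run under torch.distributed with one GPU per
+# rank, combined with the init-function builder below; `ckpt_dir` and
+# `module` are hypothetical.
+#
+#   shard_states = parallel_load_safetensors(ckpt_dir)
+#   init_fn = parallel_init_module_fn(module, shard_states)
+#   init_fn(module)   # materializes meta params, broadcasting shards across ranks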
+
+
+def parallel_init_module_fn(module: torch.nn.Module, shard_states: Dict[str, torch.nn.Parameter]):
+ """
+ Generate a function to initialize sub-modules in the `module` with `shard_states`
+ from huggingface checkpoint.
+
+ Args:
+ module (torch.nn.Module): the global module to be initialized
+ shard_states (Dict[str, torch.nn.Parameter]): the shard states from huggingface checkpoint
+
+ Returns:
+ init_fn (Callable): a function to initialize sub-modules in the `module` with `shard_states`
+ """
+
+ state2fqn = {}
+ for name, state in itertools.chain(module.named_parameters(remove_duplicate=False),
+ module.named_buffers(remove_duplicate=False)):
+ state2fqn.setdefault(state, []).append(name)
+ # collect parameters and buffers that are shared (registered under more than one name)
+ shared = {s for s, names in state2fqn.items() if len(names) > 1}
+ materialized_states = {}
+
+ @torch.no_grad()
+ def create_and_sync_state(param_name, state, is_param):
+ assert param_name in shard_states, f"{param_name} not loaded"
+ device = torch.cuda.current_device()
+ if is_param:
+ param = torch.nn.Parameter(torch.empty_like(state.data, device=device), requires_grad=state.requires_grad)
+ else: # buffer
+ param = torch.empty_like(state.data, device=device)
+ loaded = shard_states[param_name]
+ if isinstance(loaded, (torch.nn.Parameter, torch.Tensor)):
+ # NOTE: loaded.dtype can be different with param.dtype
+ param.data.copy_(loaded.data)
+ dist.broadcast(param.data, src=dist.get_rank())
+ else:
+ assert isinstance(loaded, int) # the rank that holds the state
+ dist.broadcast(param.data, src=loaded)
+ shard_states.pop(param_name)
+ del loaded
+ return param
+
+ def init_fn(sub_mod: torch.nn.Module, recurse: bool = True):
+ param_and_buffers = tuple(sub_mod.named_parameters(recurse=False)) + tuple(sub_mod.named_buffers(recurse=False))
+ for name, state in param_and_buffers:
+ if not state.is_meta:
+ continue
+ is_param = name in sub_mod._parameters
+ fqn = state2fqn[state].pop(0)
+ # non-persistent buffers are not saved in the state dict, so we can safely skip them
+ if (not is_param) and fqn not in shard_states:
+ if state.is_meta:
+ raise RuntimeError(
+ f"found a non-persistent buffer ({fqn}) initialized on the meta device. "
+ "Such buffers are not saved in checkpoints; the user must ensure they are initialized on a CPU / GPU device.")
+ continue
+ # for a shared parameter, reuse the state materialized when it was first created
+ if state in shared:
+ if state not in materialized_states:
+ materialized_states[state] = create_and_sync_state(fqn, state, is_param)
+ else:
+ if fqn in shard_states:
+ shard_states.pop(fqn)
+ materialize_state = materialized_states[state]
+ # for a non-shared parameter, create it directly
+ else:
+ materialize_state = create_and_sync_state(fqn, state, is_param)
+ if is_param:
+ sub_mod._parameters[name] = materialize_state
+ else:
+ sub_mod._buffers[name] = materialize_state
+ if recurse:
+ for child in sub_mod.children():
+ init_fn(child, recurse=True)
+
+ return sub_mod
+
+ return init_fn
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/utils/hdfs_io.py b/code/RL_model/verl/Search-R1/verl/utils/hdfs_io.py
new file mode 100644
index 0000000000000000000000000000000000000000..08c4ecb9a5956865ce35651d6eaaf6844ba87f41
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/hdfs_io.py
@@ -0,0 +1,144 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import shutil
+import logging
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv('VERL_SFT_LOGGING_LEVEL', 'WARN'))
+
+_HDFS_PREFIX = "hdfs://"
+
+_HDFS_BIN_PATH = shutil.which('hdfs')
+
+
+def exists(path: str, **kwargs) -> bool:
+ r"""Works like os.path.exists() but supports hdfs.
+
+ Test whether a path exists. Returns False for broken symbolic links.
+
+ Args:
+ path (str): path to test
+
+ Returns:
+ bool: True if the path exists, False otherwise
+ """
+ if _is_non_local(path):
+ return _exists(path, **kwargs)
+ return os.path.exists(path)
+
+
+def _exists(file_path: str):
+ """ hdfs capable to check whether a file_path is exists """
+ if file_path.startswith("hdfs"):
+ return _run_cmd(_hdfs_cmd(f"-test -e {file_path}")) == 0
+ return os.path.exists(file_path)
+
+
+def makedirs(name, mode=0o777, exist_ok=False, **kwargs) -> None:
+ r"""Works like os.makedirs() but supports hdfs.
+
+ Super-mkdir; create a leaf directory and all intermediate ones. Works like
+ mkdir, except that any intermediate path segment (not just the rightmost)
+ will be created if it does not exist. If the target directory already
+ exists, raise an OSError if exist_ok is False. Otherwise no exception is
+ raised. This is recursive.
+
+ Args:
+ name (str): directory to create
+ mode (int): file mode bits
+ exist_ok (bool): if True, do not raise an exception if the directory already exists
+ kwargs: keyword arguments for hdfs
+
+ """
+ if _is_non_local(name):
+ # TODO(haibin.lin):
+ # - handle OSError for hdfs(?)
+ # - support exist_ok for hdfs(?)
+ _mkdir(name, **kwargs)
+ else:
+ os.makedirs(name, mode=mode, exist_ok=exist_ok)
+
+
+def _mkdir(file_path: str) -> bool:
+ """hdfs mkdir"""
+ if file_path.startswith("hdfs"):
+ _run_cmd(_hdfs_cmd(f"-mkdir -p {file_path}"))
+ else:
+ os.makedirs(file_path, exist_ok=True)
+ return True
+
+
+def copy(src: str, dst: str, **kwargs) -> bool:
+ r"""Works like shutil.copy() for file, and shutil.copytree for dir, and supports hdfs.
+
+ Copy data and mode bits ("cp src dst"). Return the file's destination.
+ The destination may be a directory.
+ If source and destination are the same file, a SameFileError will be
+ raised.
+
+ Args:
+ src (str): source file path
+ dst (str): destination file path
+ kwargs: keyword arguments for hdfs copy
+
+ Returns:
+ bool | str: True/False for HDFS-involved copies; the destination path for local copies
+
+ """
+ if _is_non_local(src) or _is_non_local(dst):
+ # TODO(haibin.lin):
+ # - handle SameFileError for hdfs files(?)
+ # - return file destination for hdfs files
+ return _copy(src, dst)
+ else:
+ if os.path.isdir(src):
+ return shutil.copytree(src, dst, **kwargs)
+ else:
+ return shutil.copy(src, dst, **kwargs)
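+
+# Usage sketch (illustrative; the hdfs:// paths are hypothetical):
+#
+#   if not exists("hdfs://ns/user/me/ckpts"):
+#       makedirs("hdfs://ns/user/me/ckpts")
+#   copy("/tmp/model.bin", "hdfs://ns/user/me/ckpts")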
+
+
+def _copy(from_path: str, to_path: str, timeout: int = None) -> bool:
+ if to_path.startswith("hdfs"):
+ if from_path.startswith("hdfs"):
+ returncode = _run_cmd(_hdfs_cmd(f"-cp -f {from_path} {to_path}"), timeout=timeout)
+ else:
+ returncode = _run_cmd(_hdfs_cmd(f"-put -f {from_path} {to_path}"), timeout=timeout)
+ else:
+ if from_path.startswith("hdfs"):
+ returncode = _run_cmd(_hdfs_cmd(f"-get \
+ {from_path} {to_path}"), timeout=timeout)
+ else:
+ try:
+ shutil.copy(from_path, to_path)
+ returncode = 0
+ except shutil.SameFileError:
+ returncode = 0
+ except Exception as e:
+ logger.warning(f"copy {from_path} {to_path} failed: {e}")
+ returncode = -1
+ return returncode == 0
+
+
+def _run_cmd(cmd: str, timeout=None):
+ # NOTE: `timeout` is accepted for interface compatibility but currently ignored,
+ # since os.system provides no timeout support.
+ return os.system(cmd)
+
+
+def _hdfs_cmd(cmd: str) -> str:
+ return f"{_HDFS_BIN_PATH} dfs {cmd}"
+
+
+def _is_non_local(path: str):
+ return path.startswith(_HDFS_PREFIX)
diff --git a/code/RL_model/verl/Search-R1/verl/utils/import_utils.py b/code/RL_model/verl/Search-R1/verl/utils/import_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5690512d144a30d2a1f0bd128a40eb8876936b7
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/import_utils.py
@@ -0,0 +1,48 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utilities to check if packages are available.
+We assume package availability won't change during runtime.
+"""
+
+from functools import cache
+
+
+@cache
+def is_megatron_core_available():
+ try:
+ from megatron.core import parallel_state as mpu
+ return True
+ except ImportError:
+ return False
+
+
+@cache
+def is_vllm_available():
+ try:
+ import vllm
+ return True
+ except ImportError:
+ return False
+
+
+def import_external_libs(external_libs=None):
+ if external_libs is None:
+ return
+ if not isinstance(external_libs, list):
+ external_libs = [external_libs]
+ import importlib
+ for external_lib in external_libs:
+ importlib.import_module(external_lib)
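+
+# Usage sketch (illustrative): `my_reward_plugin` is a hypothetical package
+# that registers custom functions as an import side effect.
+#
+#   import_external_libs(["my_reward_plugin"])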
diff --git a/code/RL_model/verl/Search-R1/verl/utils/logger/__init__.py b/code/RL_model/verl/Search-R1/verl/utils/logger/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/logger/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/utils/logger/aggregate_logger.py b/code/RL_model/verl/Search-R1/verl/utils/logger/aggregate_logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac57cf58e3de2862b5443189ccec276a7d2fc283
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/logger/aggregate_logger.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+A Ray logger will receive logging info from different processes.
+"""
+import numbers
+from typing import Dict
+
+
+def concat_dict_to_str(data: Dict, step):
+ output = [f'step:{step}']
+ for k, v in data.items():
+ if isinstance(v, numbers.Number):
+ output.append(f'{k}:{v:.3f}')
+ output_str = ' - '.join(output)
+ return output_str
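+
+# Example: concat_dict_to_str({'loss': 0.1234, 'lr': 1e-4}, step=10)
+# returns 'step:10 - loss:0.123 - lr:0.000'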
+
+
+class LocalLogger:
+
+ def __init__(self, remote_logger=None, enable_wandb=False, print_to_console=False):
+ self.print_to_console = print_to_console
+ if print_to_console:
+ print('Using LocalLogger is deprecated; the constructor API will change.')
+
+ def flush(self):
+ pass
+
+ def log(self, data, step):
+ if self.print_to_console:
+ print(concat_dict_to_str(data, step=step), flush=True)
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/utils/logging_utils.py b/code/RL_model/verl/Search-R1/verl/utils/logging_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bf6e1f0fa70784edb6a7e6efecdba07f0c399b3
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/logging_utils.py
@@ -0,0 +1,22 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+
+def set_basic_config(level):
+ """
+ This function sets the global logging format and level. It is called when verl is imported.
+ """
+ logging.basicConfig(format='%(levelname)s:%(asctime)s:%(message)s', level=level)
diff --git a/code/RL_model/verl/Search-R1/verl/utils/megatron/__init__.py b/code/RL_model/verl/Search-R1/verl/utils/megatron/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/megatron/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/utils/megatron/memory.py b/code/RL_model/verl/Search-R1/verl/utils/megatron/memory.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e8570ed495d83e74a1d0c7b1d17181271ce92a6
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/megatron/memory.py
@@ -0,0 +1,41 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+
+class MemoryBuffer:
+
+ def __init__(self, numel, numel_padded, dtype):
+ self.numel = numel
+ self.numel_padded = numel_padded
+ self.dtype = dtype
+ self.data = torch.zeros(self.numel_padded,
+ dtype=self.dtype,
+ device=torch.cuda.current_device(),
+ requires_grad=False)
+
+ def zero(self):
+ """Reset the buffer to zero."""
+ self.data.zero_()
+
+ def get(self, shape, start_index):
+ """Return a tensor with the input `shape` as a view into the
+ 1-D data starting at `start_index`."""
+ end_index = start_index + shape.numel()
+ assert end_index <= self.numel, \
+ 'requested tensor is out of the buffer range.'
+ buffer_tensor = self.data[start_index:end_index]
+ buffer_tensor = buffer_tensor.view(shape)
+ return buffer_tensor
diff --git a/code/RL_model/verl/Search-R1/verl/utils/megatron/optimizer.py b/code/RL_model/verl/Search-R1/verl/utils/megatron/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ae70b0876d5255ffd24e132ca4d60faab883582
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/megatron/optimizer.py
@@ -0,0 +1,92 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from apex.optimizers import FusedAdam as Adam
+from apex.optimizers import FusedSGD as SGD
+from megatron.optimizer.distrib_optimizer import DistributedOptimizer
+from megatron.optimizer.grad_scaler import ConstantGradScaler, DynamicGradScaler
+from megatron.optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer
+from megatron.optimizer import get_param_groups
+
+from verl.utils.megatron.optimizer_config import OptimizerConfig
+
+
+def get_megatron_optimizer(
+ model,
+ config: OptimizerConfig,
+ no_weight_decay_cond=None,
+ scale_lr_cond=None,
+ lr_mult=1.0,
+ check_for_nan_in_loss_and_grad=False,
+ overlap_param_gather=False # added for verl
+):
+ # Base optimizer.
+ param_groups = get_param_groups(model, no_weight_decay_cond, scale_lr_cond, lr_mult)
+
+ if config.optimizer == 'adam':
+ optimizer = Adam(param_groups,
+ lr=config.lr,
+ weight_decay=config.weight_decay,
+ betas=(config.adam_beta1, config.adam_beta2),
+ eps=config.adam_eps)
+ elif config.optimizer == 'sgd':
+ optimizer = SGD(param_groups, lr=config.lr, weight_decay=config.weight_decay, momentum=config.sgd_momentum)
+ else:
+ raise ValueError(f'{config.optimizer} optimizer is not supported.')
+
+ # Determine whether the params have main-grad field.
+ params_have_main_grad = True
+
+ # Mixed precision optimizer.
+ # - Note: both the Float16Optimizer and the DistributedOptimizer inherit
+ # from the MixedPrecisionOptimizer, which manages any optimizer where
+ # the model params and main params are distinct.
+ if config.fp16 or config.bf16 or config.use_distributed_optimizer:
+
+ # Grad scaler:
+ # if loss-scale is provided, instantiate the constant scaler.
+ # if we are using fp16 and loss-scale is not present, use a
+ # dynamic scaler.
+ # otherwise we are running in bf16 with no loss-scale so
+ # leave it as None.
+ grad_scaler = None
+
+ # Constant loss scale.
+ if config.loss_scale:
+ grad_scaler = ConstantGradScaler(config.loss_scale)
+
+ # Dynamic loss scale.
+ else:
+ if config.fp16:
+ grad_scaler = DynamicGradScaler(initial_scale=config.initial_loss_scale,
+ min_scale=config.min_loss_scale,
+ growth_factor=2.0,
+ backoff_factor=0.5,
+ growth_interval=config.loss_scale_window,
+ hysteresis=config.hysteresis)
+
+ # Megatron optimizer.
+ if config.use_distributed_optimizer:
+ return DistributedOptimizer(optimizer, config.clip_grad, config.log_num_zeros_in_grad,
+ check_for_nan_in_loss_and_grad, params_have_main_grad, config.fp16, config.bf16,
+ config.params_dtype, grad_scaler, model, overlap_param_gather)
+ else:
+ return Float16OptimizerWithFloat16Params(optimizer, config.clip_grad, config.log_num_zeros_in_grad,
+ check_for_nan_in_loss_and_grad, params_have_main_grad, config.fp16,
+ config.bf16, config.params_dtype, grad_scaler, model)
+
+ # FP32.
+ return FP32Optimizer(optimizer, config.clip_grad, config.log_num_zeros_in_grad, check_for_nan_in_loss_and_grad,
+ params_have_main_grad, model)
diff --git a/code/RL_model/verl/Search-R1/verl/utils/megatron/optimizer_config.py b/code/RL_model/verl/Search-R1/verl/utils/megatron/optimizer_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..3401de4163aadcad7f3a586587da38989ee19d3d
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/megatron/optimizer_config.py
@@ -0,0 +1,129 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Callable, Optional
+
+import torch
+
+
+@dataclass
+class OptimizerConfig:
+ """Configuration for optimizer."""
+
+ ##############
+ # General
+ ##############
+ optimizer: str = 'adam'
+ """Optimizer to use (one of Adam or SGD)."""
+
+ lr: Optional[float] = None
+ """Initial learning rate. Depending on decay style and initial warmup, the learning rate at each
+ iteration would be different.
+ """
+
+ min_lr: Optional[float] = None
+ """Minumum value for learning rate. The scheduler clip values below this threshold."""
+
+ decoupled_lr: Optional[float] = None
+ """Separate learning rate for the input and output layer."""
+
+ decoupled_min_lr: Optional[float] = None
+ """Minimum value for learning rate for the input and output layer. The scheduler clip values
+ below this threshold.
+ """
+
+ weight_decay: float = 0.01
+ """Weight decay coefficient for L2 regularization."""
+
+ ##############
+ # Precision
+ ##############
+ fp16: bool = False
+ """If true, train with fp16 mixed precision training. Defaults to False."""
+
+ bf16: bool = False
+ """If true, train with bf16 mixed precision training. Defaults to False."""
+
+ params_dtype: torch.dtype = torch.float32
+ """dtype used when intializing the weights. Defaults to torch.float32."""
+
+ ###############
+ # Loss scaling
+ ###############
+ loss_scale: Optional[float] = None
+ """Static loss scaling, positive power of 2 values can improve fp16 convergence. If None,
+ dynamic loss scaling is used.
+ """
+
+ initial_loss_scale: float = 2**32
+ """Initial loss-scale for dynamic loss scaling."""
+
+ min_loss_scale: float = 1.0
+ """Minimum loss scale for dynamic loss scaling."""
+
+ loss_scale_window: float = 1000
+ """Window over which to raise/lower dynamic scale."""
+
+ hysteresis: int = 2
+ """Hysteresis for dynamic loss scaling."""
+
+ ##############
+ # Optimizer
+ ##############
+ # Adam
+ adam_beta1: float = 0.9
+ """First coefficient for computing running averages of gradient and its square in Adam
+ optimizer.
+ """
+
+ adam_beta2: float = 0.999
+ """Second coefficient for computing running averages of gradient and its square in Adam
+ optimizer.
+ """
+
+ adam_eps: float = 1e-08
+ """Term added to the denominator to improve numerical stability in Adam optimizer."""
+
+ # SGD.
+ sgd_momentum: float = 0.9
+ """Momentum factor for SGD optimizer."""
+
+ #######################
+ # Distributed optimizer
+ #######################
+ use_distributed_optimizer: bool = False
+ """Distribute optimizer state over data-parallel replicas."""
+
+ overlap_grad_reduce: bool = False
+ """If true, overlap grad reduce-scatter with backward compute in distributed optimizer."""
+
+ overlap_param_gather: bool = False
+ """If true, overlap param all-gather with forward compute in distributed optimizer."""
+
+ ################
+ # Miscellaneous
+ ################
+ clip_grad: float = 1.0
+ """Gradient clipping based on global L2 norm."""
+
+ log_num_zeros_in_grad: bool = False
+ """If true, calculate and log the number of zeros in gradient."""
+
+ barrier_with_L1_time: bool = False
+ """If true, use barrier with level 1 time measurements."""
+
+ timers: Optional[Callable] = None
+ """Function to get timers."""
diff --git a/code/RL_model/verl/Search-R1/verl/utils/megatron/pipeline_parallel.py b/code/RL_model/verl/Search-R1/verl/utils/megatron/pipeline_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a3790bb1a0fe0340390b7c9083f94d9d56b9383
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/megatron/pipeline_parallel.py
@@ -0,0 +1,51 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from megatron.core import parallel_state as mpu
+
+from .sequence_parallel import pad_to_sequence_parallel
+
+
+def compute_transformers_input_shapes(batches, meta_info):
+ from flash_attn.bert_padding import unpad_input # flash-attn 2 is required for Megatron
+ # pre-compute input shapes for each micro-batch at each pp stage
+ input_shapes = []
+ for model_inputs in batches:
+ input_ids = model_inputs['input_ids']
+ attention_mask = model_inputs['attention_mask']
+ input_ids_rmpad = unpad_input(input_ids.unsqueeze(dim=-1), attention_mask)[0] # (total_nnz, 1)
+ if meta_info['sequence_parallel']:
+ input_ids_rmpad = pad_to_sequence_parallel(input_ids_rmpad)
+ # compute shapes for model_inputs
+ input_shapes.append(
+ torch.Size([
+ input_ids_rmpad.shape[0] // mpu.get_tensor_model_parallel_world_size(), 1, meta_info['hidden_size']
+ ]))
+ else:
+ # compute shapes for model_inputs
+ input_shapes.append(torch.Size([input_ids_rmpad.shape[0], 1, meta_info['hidden_size']]))
+ return input_shapes
+
+
+def make_batch_generator(batches, vpp_size):
+ if vpp_size > 1:
+ # has vpp
+ batch_generator = [batches] * vpp_size # number of vpp chunks
+ batch_generator = [iter(b) for b in batch_generator]
+ else:
+ # no vpp
+ batch_generator = iter(batches)
+ return batch_generator
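+
+# Usage sketch (illustrative): with vpp_size > 1 each virtual pipeline chunk
+# gets its own iterator over the same micro-batches.
+#
+#   gen = make_batch_generator(batches, vpp_size=2)
+#   first = next(gen[0])   # micro-batch for vpp chunk 0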
diff --git a/code/RL_model/verl/Search-R1/verl/utils/megatron/sequence_parallel.py b/code/RL_model/verl/Search-R1/verl/utils/megatron/sequence_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b76cb295ef681e30b22d45404d4d5c26493f051
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/megatron/sequence_parallel.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn.functional as F
+from megatron.core import parallel_state as mpu
+
+
+def mark_parameter_as_sequence_parallel(parameter):
+ setattr(parameter, 'sequence_parallel', True)
+
+
+def is_sequence_parallel_param(param):
+ return hasattr(param, 'sequence_parallel') and param.sequence_parallel
+
+
+def pad_to_sequence_parallel(unpad_tokens: torch.Tensor):
+ """pad the tokens such that the total length is a multiple of sp world size
+
+ Args:
+ unpad_tokens: (total_nnz, ...). Tokens after removing padding
+
+ Returns:
+
+ """
+ total_nnz = unpad_tokens.shape[0]
+ sp_world_size = mpu.get_tensor_model_parallel_world_size()
+
+ if total_nnz % sp_world_size == 0:
+ pad_size = 0
+ else:
+ pad_size = sp_world_size - total_nnz % sp_world_size
+
+ if pad_size > 0:
+ if unpad_tokens.ndim == 1:
+ unpad_tokens = F.pad(unpad_tokens, (0, pad_size))
+ elif unpad_tokens.ndim == 2:
+ unpad_tokens = F.pad(unpad_tokens, (0, 0, 0, pad_size))
+ else:
+ raise NotImplementedError(f'Padding dim {unpad_tokens.ndim} is not supported')
+
+ return unpad_tokens
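+
+# Example (illustrative): with a tensor-parallel world size of 4, a tensor of
+# shape (10, h) is padded to (12, h) so the token dimension divides evenly
+# across ranks; pad_size = 4 - 10 % 4 = 2.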
diff --git a/code/RL_model/verl/Search-R1/verl/utils/megatron/tensor_parallel.py b/code/RL_model/verl/Search-R1/verl/utils/megatron/tensor_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..25a8ce422c42498a5e5cbdddc74d6c9f3ae8d06b
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/megatron/tensor_parallel.py
@@ -0,0 +1,184 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utilities for using tensor_parallel in megatron
+"""
+from typing import Dict
+import torch
+from torch.nn import init
+import torch.distributed as dist
+from megatron.core import ModelParallelConfig
+from megatron.core import parallel_state as mpu, tensor_parallel
+import verl.utils.torch_functional as verl_F
+
+
+def update_kwargs_with_config(dictionary: Dict, config: ModelParallelConfig):
+ dictionary['config'] = config
+ return dictionary
+
+
+def get_default_kwargs_for_model_parallel_config():
+ model_parallel_config_kwargs = {
+ 'params_dtype': torch.float32,
+ 'use_cpu_initialization': False,
+ 'perform_initialization': True,
+ 'gradient_accumulation_fusion': False,
+ 'sequence_parallel': False,
+ }
+ return model_parallel_config_kwargs
+
+
+def get_default_model_parallel_config():
+ return ModelParallelConfig(**get_default_kwargs_for_model_parallel_config())
+
+
+def get_common_default_kwargs_for_parallel_linear():
+ default_model_parallel_config = get_default_model_parallel_config()
+ common_default_kwargs = {
+ 'init_method': init.xavier_normal_,
+ 'stride': 1,
+ 'keep_master_weight_for_test': False,
+ 'config': default_model_parallel_config,
+ }
+ return common_default_kwargs
+
+
+def get_default_kwargs_for_column_parallel_linear():
+ model_parallel_config_kwargs = get_default_kwargs_for_model_parallel_config()
+ column_parallel_config_kwargs = {
+ 'async_tensor_model_parallel_allreduce': False,
+ }
+ model_parallel_config_kwargs.update(column_parallel_config_kwargs)
+ column_default_kwargs = {
+ 'config': ModelParallelConfig(**model_parallel_config_kwargs),
+ }
+ common_default_kwargs = get_common_default_kwargs_for_parallel_linear()
+ common_default_kwargs.update(column_default_kwargs)
+ return common_default_kwargs
+
+
+def get_default_kwargs_for_row_parallel_linear():
+ common_default_kwargs = get_common_default_kwargs_for_parallel_linear()
+ return common_default_kwargs
+
+
+def get_default_kwargs_for_parallel_embedding():
+ model_parallel_config_kwargs = get_default_kwargs_for_model_parallel_config()
+ embedding_default_kwargs = {
+ 'init_method': init.xavier_normal_,
+ 'config': ModelParallelConfig(**model_parallel_config_kwargs),
+ }
+ return embedding_default_kwargs
+
+
+def is_tensor_parallel_param(param):
+ return (hasattr(param, 'tensor_model_parallel') and param.tensor_model_parallel)
+
+
+def get_tensor_parallel_partition_dim(param):
+ assert is_tensor_parallel_param(param)
+ return param.partition_dim
+
+
+def get_tensor_parallel_partition_stride(param):
+ assert is_tensor_parallel_param(param)
+ return param.partition_stride
+
+
+class _VocabParallelEntropy(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, vocab_parallel_logits: torch.Tensor) -> torch.Tensor:
+ logits_max = vocab_parallel_logits.max(dim=-1, keepdim=True).values
+ dist.all_reduce(logits_max, op=dist.ReduceOp.MAX, group=mpu.get_tensor_model_parallel_group())
+ normalized_vocab_parallel_logits = vocab_parallel_logits - logits_max
+ normalized_exp_logits = normalized_vocab_parallel_logits.exp()
+ normalized_sum_exp_logits = normalized_exp_logits.sum(dim=-1, keepdim=True)
+ dist.all_reduce(normalized_sum_exp_logits, group=mpu.get_tensor_model_parallel_group())
+ softmax_logits = normalized_exp_logits / normalized_sum_exp_logits
+ sum_softmax_times_logits = (softmax_logits * vocab_parallel_logits).sum(dim=-1, keepdim=True)
+ dist.all_reduce(sum_softmax_times_logits, group=mpu.get_tensor_model_parallel_group())
+ entropy = logits_max + normalized_sum_exp_logits.log() - sum_softmax_times_logits
+ ctx.save_for_backward(vocab_parallel_logits, softmax_logits, sum_softmax_times_logits)
+ return entropy.squeeze(dim=-1)
+
+ @staticmethod
+ def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor:
+ vocab_parallel_logits, softmax_logits, sum_softmax_times_logits = ctx.saved_tensors
+ grad_input = grad_output.unsqueeze(dim=-1) * softmax_logits * (sum_softmax_times_logits - vocab_parallel_logits)
+ return grad_input
+
+
+def vocab_parallel_entropy(vocab_parallel_logits: torch.Tensor) -> torch.Tensor:
+ """Compute entropy when the logits are sharded in tp ranks
+
+ Args:
+ vocab_parallel_logits: (total_nnz, vocab_size // tp_size)
+
+ Returns: (total_nnz,)
+
+ """
+ return _VocabParallelEntropy.apply(vocab_parallel_logits)
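+
+# Math note: for logits z with p = softmax(z), the entropy
+# H = -sum_i p_i * log p_i rearranges to H = logsumexp(z) - sum_i p_i * z_i.
+# The forward above computes exactly this, subtracting the (all-reduced) max
+# for numerical stability and all-reducing the partial sums over the
+# tensor-parallel vocab shards.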
+
+
+def vocab_parallel_log_probs_from_logits(logits, labels):
+ """TODO(zhangchi.usc1992): We may change the implementation later"""
+ return -tensor_parallel.vocab_parallel_cross_entropy(vocab_parallel_logits=logits, target=labels)
+
+
+def vocab_parallel_log_probs_from_logits_response_rmpad(input_ids, attention_mask, logits_rmpad, response_length):
+ """Similar to log_probs_from_logits_response_rmpad, but the logits_rmpad is now spliited across tensor parallel region.
+ This will further reduce the peak memory usage during training
+
+ Args:
+ input_ids: [batch_size, seqlen]
+ attention_mask: [batch_size, seqlen]
+ logits_rmpad: [total_nnz, vocab_size // tp_size]
+ response_length: int
+
+ """
+ from flash_attn.bert_padding import pad_input, unpad_input
+
+ batch_size, seqlen = input_ids.shape
+ input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1), attention_mask=attention_mask)
+ input_ids_rmpad = input_ids_rmpad.squeeze(-1)
+ input_ids_rmpad_rolled = torch.roll(input_ids_rmpad, shifts=-1, dims=0)
+ full_log_probs_rmpad = vocab_parallel_log_probs_from_logits(logits=logits_rmpad,
+ labels=input_ids_rmpad_rolled) # (total_nnz,)
+ full_output = pad_input(hidden_states=full_log_probs_rmpad.unsqueeze(-1),
+ indices=indices,
+ batch=batch_size,
+ seqlen=seqlen)
+ output = full_output.squeeze(-1)[:, -response_length - 1:-1] # [batch_size, response_length]
+ return output
+
+
+def vocab_parallel_compute_entropy_loss(logits, eos_mask):
+ """Compute Categorical entropy loss
+
+ Args:
+ logits: `(torch.Tensor)`
+ shape: (bs, response_length, vocab_size)
+ eos_mask: `(torch.Tensor)`
+ shape: (bs, response_length)
+
+ Returns:
+ entropy: a scalar torch.Tensor
+
+ """
+ # compute entropy
+ entropy = vocab_parallel_entropy(logits)
+ entropy_loss = verl_F.masked_mean(entropy, mask=eos_mask)
+ return entropy_loss
diff --git a/code/RL_model/verl/Search-R1/verl/utils/megatron_utils.py b/code/RL_model/verl/Search-R1/verl/utils/megatron_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcb6b65a79ea302e3f7eaccd5145e29adbb9edd6
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/megatron_utils.py
@@ -0,0 +1,253 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Pretrain utilities."""
+from typing import Any, Dict
+from omegaconf import DictConfig
+from verl.utils.torch_dtypes import PrecisionType
+from verl.utils.memory_buffer import build_memory_reference_from_module
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from megatron.core import mpu, tensor_parallel
+from megatron.core.utils import get_model_config
+from megatron.core.transformer import TransformerConfig
+from megatron.core.transformer.module import Float16Module
+# from megatron.core.distributed import DistributedDataParallelConfig
+from megatron.core.distributed import DistributedDataParallel as DDP
+from megatron.core.enums import ModelType
+
+
+def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True):
+ """Build the model."""
+ # Build model.
+ if mpu.get_pipeline_model_parallel_world_size() > 1 and \
+ mpu.get_virtual_pipeline_model_parallel_world_size() is not None:
+ assert model_type != ModelType.encoder_and_decoder, \
+ "Interleaved schedule not supported for model with both encoder and decoder"
+ model = []
+ for i in range(mpu.get_virtual_pipeline_model_parallel_world_size()):
+ mpu.set_virtual_pipeline_model_parallel_rank(i)
+ # Set pre_process and post_process only after virtual rank is set.
+ pre_process = mpu.is_pipeline_first_stage()
+ post_process = mpu.is_pipeline_last_stage()
+ this_model = model_provider_func(pre_process=pre_process, post_process=post_process)
+ this_model.model_type = model_type
+ model.append(this_model)
+ else:
+ pre_process = mpu.is_pipeline_first_stage()
+ post_process = mpu.is_pipeline_last_stage()
+ add_encoder = True
+ add_decoder = True
+ if model_type == ModelType.encoder_and_decoder:
+ if mpu.get_pipeline_model_parallel_world_size() > 1:
+ assert mpu.get_pipeline_model_parallel_split_rank() is not None, \
+ "Split rank needs to be specified for model with both encoder and decoder"
+ rank = mpu.get_pipeline_model_parallel_rank()
+ split_rank = mpu.get_pipeline_model_parallel_split_rank()
+ world_size = mpu.get_pipeline_model_parallel_world_size()
+ pre_process = rank == 0 or rank == split_rank
+ post_process = (rank == (split_rank - 1)) or (rank == (world_size - 1))
+ add_encoder = mpu.is_pipeline_stage_before_split()
+ add_decoder = mpu.is_pipeline_stage_after_split()
+ model = model_provider_func(pre_process=pre_process,
+ post_process=post_process,
+ add_encoder=add_encoder,
+ add_decoder=add_decoder)
+ else:
+ model = model_provider_func(pre_process=pre_process, post_process=post_process)
+ model.model_type = model_type
+
+ if not isinstance(model, list):
+ model = [model]
+
+ # Set tensor model parallel attributes if not set.
+ # Only parameters that are already tensor model parallel have these
+ # attributes set for them. We should make sure the default attributes
+ # are set for all params so the optimizer can use them.
+ for model_module in model:
+ for param in model_module.parameters():
+ tensor_parallel.set_defaults_if_not_set_tensor_model_parallel_attributes(param)
+
+ # Print number of parameters.
+ if mpu.get_data_parallel_rank() == 0:
+ print(' > number of parameters on (tensor, pipeline) '
+ 'model parallel rank ({}, {}): {}'.format(
+ mpu.get_tensor_model_parallel_rank(), mpu.get_pipeline_model_parallel_rank(),
+ sum([sum([p.nelement() for p in model_module.parameters()]) for model_module in model])),
+ flush=True)
+
+ # GPU allocation.
+ for model_module in model:
+ model_module.cuda(torch.cuda.current_device())
+
+ # Fp16 conversion.
+ config = get_model_config(model[0])
+ if config.fp16 or config.bf16: # the ModelParallelConfig in GPTModel
+ model = [Float16Module(config, model_module) for model_module in model]
+
+ if wrap_with_ddp:
+ model = [
+ DDP(config=config,
+ module=model_chunk,
+ data_parallel_group=mpu.get_data_parallel_group(with_context_parallel=True),
+ accumulate_allreduce_grads_in_fp32=True,
+ overlap_grad_reduce=False,
+ use_distributed_optimizer=True,
+ disable_bucketing=(model_chunk_idx > 0)) for (model_chunk_idx, model_chunk) in enumerate(model)
+ ]
+ # Broadcast params from the data parallel src rank to other data parallel ranks.
+ for model_module in model:
+ model_module.broadcast_params()
+ return model
+
+
+ALL_MODULE_WRAPPER_CLASSNAMES = (DDP, Float16Module)
+
+
+def unwrap_model(model, module_instances=ALL_MODULE_WRAPPER_CLASSNAMES):
+ return_list = True
+ if not isinstance(model, list):
+ model = [model]
+ return_list = False
+ unwrapped_model = []
+ for model_module in model:
+ while isinstance(model_module, module_instances):
+ model_module = model_module.module
+ unwrapped_model.append(model_module)
+ if not return_list:
+ return unwrapped_model[0]
+ return unwrapped_model
+
+
+from transformers import PretrainedConfig
+
+
+def convert_config(hf_config: PretrainedConfig, megatron_config) -> TransformerConfig:
+ print(f'megatron config {megatron_config}')
+ dt = PrecisionType.to_dtype(megatron_config['param_dtype'])
+ print(f'params/pipeline dtype: {dt}')
+ transformer_config = TransformerConfig(
+ num_layers=hf_config.num_hidden_layers,
+ hidden_size=hf_config.hidden_size,
+ num_attention_heads=hf_config.num_attention_heads,
+ num_query_groups=hf_config.num_key_value_heads,
+ ffn_hidden_size=hf_config.intermediate_size,
+ # max_position_embeddings=hf_config.max_position_embeddings,
+ activation_func=F.silu,
+ normalization='RMSNorm',
+ # rotary_percent=False, # default,
+ gated_linear_unit=True, # for llama
+ use_cpu_initialization=True,
+ apply_residual_connection_post_layernorm=False, # TODO: check what this means
+ add_bias_linear=False,
+ tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(),
+ pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(),
+ virtual_pipeline_model_parallel_size=mpu.get_virtual_pipeline_model_parallel_world_size(),
+ pipeline_dtype=PrecisionType.to_dtype(megatron_config['param_dtype']),
+ params_dtype=PrecisionType.to_dtype(megatron_config['param_dtype']),
+ sequence_parallel=megatron_config['sequence_parallel_enabled'],
+ variable_seq_lengths=True,
+ masked_softmax_fusion=True,
+ bf16=PrecisionType.to_dtype(megatron_config['param_dtype']) is torch.bfloat16)
+ if torch.distributed.get_rank() == 0:
+ print(f'tensor_model_parallel_size={transformer_config.tensor_model_parallel_size}\n'
+ f'pipeline_model_parallel_size={transformer_config.pipeline_model_parallel_size}\n'
+ f'virtual_pipeline_model_parallel_size={transformer_config.virtual_pipeline_model_parallel_size}\n'
+ f'pipeline_dtype={transformer_config.pipeline_dtype}\n'
+ f'params_dtype={transformer_config.params_dtype}\n'
+ f'sequence_parallel={transformer_config.sequence_parallel}\n'
+ f'variable_seq_lengths={transformer_config.variable_seq_lengths}\n'
+ f'masked_softmax_fusion={transformer_config.masked_softmax_fusion}')
+
+ return transformer_config
+
+
+# from megatron.core.optimizer import OptimizerConfig
+
+from verl.utils.megatron.optimizer_config import OptimizerConfig
+
+
+def init_megatron_optim_config(optim_config: Dict) -> OptimizerConfig:
+ config = OptimizerConfig(
+ optimizer='adam',
+ lr=optim_config.get('lr'),
+ clip_grad=optim_config.get('clip_grad'),
+ weight_decay=1e-2,
+ bf16=True,
+ params_dtype=torch.bfloat16,
+ use_distributed_optimizer=True,
+ )
+ return config
+
+
+from megatron.core import ModelParallelConfig
+
+
+def init_model_parallel_config(config: DictConfig) -> ModelParallelConfig:
+ # TODO(sgm): check how to disable megatron timers
+ timers = FakeTimers()
+ return ModelParallelConfig(tensor_model_parallel_size=config.get('tensor_model_parallel_size'),
+ pipeline_model_parallel_size=config.get('pipeline_model_parallel_size'),
+ virtual_pipeline_model_parallel_size=config.get('virtual_pipeline_model_parallel_size'),
+ sequence_parallel=config.get('sequence_parallel'),
+ params_dtype=PrecisionType.to_dtype(config.get('param_dtype')),
+ pipeline_dtype=PrecisionType.to_dtype(config.get('param_dtype')),
+ bf16=True,
+ fp16=False,
+ timers=timers)
+
+
+class FakeTimers:
+ """Disable All Megatron Timing with FakeTimers"""
+
+ def __init__(self):
+ from megatron.timers import DummyTimer
+ self.dummy_timer = DummyTimer()
+
+ def __call__(self, *args: Any, **kwds: Any) -> Any:
+ return self.dummy_timer
+
+
+def offload_megatron_param_and_grad(module_list: nn.ModuleList, offload_grad=False, hybrid_engine=None):
+ if hybrid_engine is not None:
+ pp_rank = mpu.get_pipeline_model_parallel_rank()
+ for buffer in hybrid_engine.memory_buffers[pp_rank].values():
+ buffer.data = buffer.data.to('cpu', non_blocking=True)
+ build_memory_reference_from_module(module_list, hybrid_engine.memory_buffers[pp_rank], maintain_weight=True)
+ else:
+ for module in module_list:
+ for _, param in module.named_parameters():
+ param.data = param.data.to('cpu', non_blocking=True)
+ if offload_grad and param.grad is not None:
+ param.grad = param.grad.to("cpu", non_blocking=True)
+ torch.cuda.empty_cache()
+
+
+def load_megatron_param_and_grad(module_list: nn.ModuleList, device_id, load_grad=False, hybrid_engine=None):
+ if hybrid_engine is not None:
+ pp_rank = mpu.get_pipeline_model_parallel_rank()
+ for buffer in hybrid_engine.memory_buffers[pp_rank].values():
+ buffer.data = buffer.data.to(device_id, non_blocking=True)
+ build_memory_reference_from_module(module_list, hybrid_engine.memory_buffers[pp_rank], maintain_weight=True)
+ else:
+ for module in module_list:
+ for _, param in module.named_parameters():
+ param.data = param.data.to(device_id, non_blocking=True)
+ if load_grad and param.grad is not None:
+ param.grad = param.grad.to(device_id, non_blocking=True)
+ torch.cuda.empty_cache()
diff --git a/code/RL_model/verl/Search-R1/verl/utils/memory_buffer.py b/code/RL_model/verl/Search-R1/verl/utils/memory_buffer.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e07e42f7bc4648d3376dba404ae122e07ccb0d0
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/memory_buffer.py
@@ -0,0 +1,214 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This file contains utilities to manipulate torch memory buffers
+"""
+
+from typing import Dict, List
+
+import torch
+from torch import nn
+
+
+class MemoryBuffer:
+ """
+ A memory buffer is a contiguous torch tensor that may back multiple tensors sharing the underlying
+ memory. It must have a single dtype to support this behavior.
+ """
+
+ def __init__(self, numel: int, numel_padded: int, dtype: torch.dtype):
+ self.numel = numel
+ self.numel_padded = numel_padded
+ self.dtype = dtype
+ self.data = torch.zeros(self.numel_padded, dtype=self.dtype, device='cuda', requires_grad=False)
+
+ def zero(self):
+ """Reset the buffer to zero."""
+ self.data.zero_()
+
+ def get(self, shape, start_index):
+ """Return a tensor with the input `shape` as a view into the
+ 1-D data starting at `start_index`."""
+ end_index = start_index + shape.numel()
+ assert end_index <= self.numel, \
+ 'requested tensor is out of the buffer range.'
+ buffer_tensor = self.data[start_index:end_index]
+ buffer_tensor = buffer_tensor.view(shape)
+ return buffer_tensor
+
+
+def calc_padded_numel(shape: torch.Size, dtype: torch.dtype):
+ """for cuda memory alignment, make sure alignment by 128-bits"""
+ align_numel = 128 // torch.finfo(dtype).bits
+ numel = shape.numel()
+ return (numel + align_numel - 1) // align_numel * align_numel
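+
+# Example: 128 bits / 32 bits = 4-element alignment for fp32, so a tensor of
+# 15 elements pads to 16:
+#
+#   calc_padded_numel(torch.Size([3, 5]), torch.float32)  # -> 16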
+
+
+def get_weight_buffer_meta_from_module(module: nn.Module) -> Dict[str, Dict]:
+ """
+ Return a dictionary containing name to a shape and dtype.
+ """
+ weight_buffer_meta = {}
+ for name, param in sorted(module.named_parameters()):
+ weight_buffer_meta[name] = {'shape': param.shape, 'dtype': param.dtype}
+ return weight_buffer_meta
+
+
+def build_memory_buffer(weight_buffer_meta: Dict[str, Dict]) -> Dict[torch.dtype, MemoryBuffer]:
+ """Build the memory buffer given weight_buffer_meta
+
+ Args:
+ weight_buffer_meta: contains mapping from name to a dictionary containing shape and dtype of the tensors
+
+ Returns: a large memory buffer for each dtype that can hold all the tensors
+
+ """
+ memory_buffers = {}
+ total_numel_map = {} # map from dtype to the total numel
+ for name, meta_info in sorted(weight_buffer_meta.items()):
+ shape = meta_info['shape']
+ dtype = meta_info['dtype']
+
+ assert isinstance(shape, torch.Size)
+ assert isinstance(dtype, torch.dtype)
+
+ if dtype not in total_numel_map:
+ total_numel_map[dtype] = 0
+
+ total_numel_map[dtype] += calc_padded_numel(shape, dtype)
+
+ for dtype, total_numel in total_numel_map.items():
+ memory_buffers[dtype] = MemoryBuffer(total_numel, total_numel, dtype)
+
+ return memory_buffers
+
+
+def build_memory_reference_from_module(module: torch.nn.Module,
+ memory_buffers: Dict[torch.dtype, MemoryBuffer],
+ maintain_weight=True):
+ start_index = {}
+ for dtype in memory_buffers.keys():
+ start_index[dtype] = 0
+ for name, param in sorted(module.named_parameters()):
+ memory_buffer = memory_buffers[param.dtype]
+ buffer = memory_buffer.get(shape=param.shape, start_index=start_index[param.dtype])
+ # need to increment start_index
+ start_index[param.dtype] += calc_padded_numel(param.shape, param.dtype)
+ if maintain_weight:
+ buffer.copy_(param.data)
+ param.data = buffer
+
+
+def build_memory_reference(weight_buffer_meta: Dict[str, Dict], memory_buffers: Dict[torch.dtype, MemoryBuffer]):
+ """Build the memory references. The memory buffers are built using the build_memory_buffer API.
+ This API will allocate a weight buffer pointer to the memory buffer according to the weight_buffer_meta.
+
+ Args:
+ weight_buffer_meta:
+ memory_buffers:
+
+ Returns:
+
+ """
+ start_idx = {}
+ weight_buffers = {}
+ for dtype in memory_buffers.keys():
+ start_idx[dtype] = 0
+
+ for name, meta_info in sorted(weight_buffer_meta.items()):
+ shape = meta_info['shape']
+ dtype = meta_info['dtype']
+
+ buffer = memory_buffers[dtype].get(shape, start_index=start_idx[dtype])
+ start_idx[dtype] += calc_padded_numel(shape, dtype)
+ weight_buffers[name] = buffer
+
+ return weight_buffers
+
+
+class MemoryBufferModuleWrapper:
+ """
+ Note that MemoryBufferModuleWrapper is deliberately not an nn.Module, because
+ wrapping would change the parameter names in the checkpoint.
+ """
+
+ def __init__(self, module: nn.Module):
+ super().__init__()
+ self.module = module
+ self.weight_buffer_meta = get_weight_buffer_meta_from_module(self.module)
+ self.memory_buffers = build_memory_buffer(self.weight_buffer_meta)
+ build_memory_reference_from_module(self.module, self.memory_buffers)
+
+ def get_memory_buffers(self):
+ return self.memory_buffers
+
+ def get_weight_buffer_meta(self):
+ return self.weight_buffer_meta
+
+
+class MegatronMemoryBufferForRollout(object):
+ """
+ We assume that
+ - inference engine has tp + dp
+ - actor has tp + pp + dp
+ - the tp between inference engine and actor should be the same
+ - memory_buffers: contains a list of memory_buffers, each is a dict from dtype to MemoryBuffer
+ - weight_buffers: contains a list of weight_buffers, each is a dict from name to param
+ - named_parameters: a dict from name to parameter that normalizes the names from pp and vpp. Note that
+ the named_parameters may not be directly compatible with inference engine. User has to take care of
+ this part such as the layout mismatches. (e.g. qkv transpose)
+ - Note that weight_buffer, named_parameters and memory_buffers share the same underlying GPU memory.
+ - When doing weight sync, the data is transferred via the memory buffers
+ """
+
+ def __init__(self, transform_memory_param_fn):
+ self._memory_buffers = []
+ self._weight_buffers = []
+ self._named_parameters = {}
+ self.transform_memory_param_fn = transform_memory_param_fn
+
+ def initialize_weight_buffer(self, weight_buffer_meta_pp: List[Dict[str, Dict]]):
+ """
+ Initialize the weight buffer. The weight buffer is obtained according to the actor. We will construct
+ a large buffer for each dtype in the weight_buffer.
+
+ Args:
+ weight_buffer_meta_pp: one entry per pp stage; each entry maps parameter names to shape/dtype meta info
+
+ Returns: None
+
+ """
+ self.weight_buffer_meta_pp = weight_buffer_meta_pp
+
+ for weight_buffer_meta in self.weight_buffer_meta_pp:
+ memory_buffer = build_memory_buffer(weight_buffer_meta)
+ self._memory_buffers.append(memory_buffer)
+ self._weight_buffers.append(None)
+
+ def build_memory_reference(self):
+ for i, weight_buffer_meta in enumerate(self.weight_buffer_meta_pp):
+ self._weight_buffers[i] = build_memory_reference(weight_buffer_meta, self._memory_buffers[i])
+ self._named_parameters = self.transform_memory_param_fn(self._weight_buffers)
+
+ @property
+ def named_parameters(self):
+ return self._named_parameters
+
+ @property
+ def weight_buffers(self):
+ return self._weight_buffers
+
+ @property
+ def memory_buffers(self):
+ return self._memory_buffers
diff --git a/code/RL_model/verl/Search-R1/verl/utils/model.py b/code/RL_model/verl/Search-R1/verl/utils/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..9002451a1dce34b8c844f907ee6ac487351b5314
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/model.py
@@ -0,0 +1,332 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utilities to create common models from huggingface
+"""
+import os
+import warnings
+from typing import Dict, Type
+
+import numpy as np
+import torch
+from torch import nn
+from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, MistralForSequenceClassification
+from verl.models.registry import ModelRegistry
+
+
+class LambdaLayer(nn.Module):
+
+ def __init__(self, fn):
+ super().__init__()
+ self.fn = fn
+
+ def forward(self, *args, **kwargs):
+ return self.fn(*args, **kwargs)
+
+
+def squeeze(x):
+ return torch.squeeze(x, dim=-1)
+
+
+def update_model_config(module_config, override_config_kwargs):
+ for key, val in override_config_kwargs.items():
+ setattr(module_config, key, val)
+
+
+def get_huggingface_actor_config(model_name: str, override_config_kwargs=None, trust_remote_code=False) -> PretrainedConfig:
+ if override_config_kwargs is None:
+ override_config_kwargs = {}
+ assert isinstance(override_config_kwargs, Dict), \
+ f'override_config_kwargs must be a dict, got {type(override_config_kwargs)}'
+ module_config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
+ update_model_config(module_config, override_config_kwargs)
+
+ return module_config
+
+
+def create_huggingface_actor(model_name: str, override_config_kwargs=None, automodel_kwargs=None) -> nn.Module:
+ """
+
+ Args:
+ model_name:
+ actor_override_config_kwargs:
+
+ Returns:
+
+ """
+ if override_config_kwargs is None:
+ override_config_kwargs = {}
+ if automodel_kwargs is None:
+ automodel_kwargs = {}
+ assert isinstance(override_config_kwargs, Dict), \
+ f'override_config_kwargs must be a dict, got {type(override_config_kwargs)}'
+ module_config = get_huggingface_actor_config(model_name,
+ override_config_kwargs,
+ trust_remote_code=automodel_kwargs.get('trust_remote_code', False))
+ module: nn.Module = AutoModelForCausalLM.from_config(module_config, **automodel_kwargs)
+ return module
+
+
+def create_huggingface_critic(model_name: str, override_config_kwargs=None, automodel_kwargs=None) -> nn.Module:
+ """
+
+ Args:
+ model_name:
+ override_config_kwargs:
+
+ Returns:
+
+ """
+ critic_module: nn.Module = create_huggingface_actor(model_name,
+ override_config_kwargs=override_config_kwargs,
+ automodel_kwargs=automodel_kwargs)
+ if automodel_kwargs is None:
+ automodel_kwargs = {}
+ torch_dtype = automodel_kwargs.get('torch_dtype', torch.float32)
+ critic_module.lm_head = nn.Sequential(nn.Linear(critic_module.config.hidden_size, 1, dtype=torch_dtype),
+ LambdaLayer(fn=squeeze))
+ return critic_module
+
+
+def get_model_size(model: nn.Module, scale='auto'):
+ n_params = sum(p.numel() for p in model.parameters())
+
+ if scale == 'auto':
+ if n_params > 1e9:
+ scale = 'B'
+ elif n_params > 1e6:
+ scale = 'M'
+ elif n_params > 1e3:
+ scale = 'K'
+ else:
+ scale = ''
+
+ if scale == 'B':
+ n_params = n_params / 1e9
+ elif scale == 'M':
+ n_params = n_params / 1e6
+ elif scale == 'K':
+ n_params = n_params / 1e3
+ elif scale == '':
+ pass
+ else:
+ raise NotImplementedError(f'Unknown scale {scale}')
+
+ return n_params, scale
+
+
+def print_model_size(model: nn.Module, name: str = None):
+ n_params, scale = get_model_size(model, scale='auto')
+ if name is None:
+ name = model.__class__.__name__
+ print(f'{name} contains {n_params:.2f}{scale} parameters')
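+ # e.g. (illustrative): a model with 124,439,808 parameters prints
+ # "<ClassName> contains 124.44M parameters"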
+
+
+def create_random_mask(input_ids: torch.Tensor,
+ max_ratio_of_valid_token: float,
+ max_ratio_of_left_padding: float,
+ min_ratio_of_valid_token: float = 0):
+ """Create a random mask given input_ids. Support left padding and right padding.
+ Process:
+ - Sample valid token length
+ - Sample left_padding length
+ - Generate padding
+
+ Args:
+ input_ids:
+ shape (batch_size, seq_len)
+
+ Returns:
+
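+ Example (illustrative): with seq_len=8, max_ratio_of_valid_token=0.5 and
+ max_ratio_of_left_padding=0.25, one sampled row may be
+ [0, 0, 1, 1, 1, 0, 0, 0] (2 left pads, 3 valid tokens, right padding).
+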
+ """
+ assert max_ratio_of_valid_token > 0 and max_ratio_of_valid_token <= 1.
+ assert max_ratio_of_left_padding >= 0 and max_ratio_of_left_padding < 1.
+ assert min_ratio_of_valid_token <= max_ratio_of_valid_token
+
+ batch_size, sequence_length = input_ids.shape
+ max_num_valid_tokens = int(sequence_length * max_ratio_of_valid_token)
+ min_num_valid_tokens = max(1, int(sequence_length * min_ratio_of_valid_token))
+ max_left_padding = int(sequence_length * max_ratio_of_left_padding)
+ assert max_num_valid_tokens + max_left_padding <= sequence_length
+ assert 0 < max_num_valid_tokens <= sequence_length
+ masks = torch.ones_like(input_ids, dtype=torch.int64)
+ # TODO: we can make this faster
+ for i in range(batch_size):
+ num_left_padding = np.random.randint(low=0, high=max_left_padding + 1, dtype=np.int64)
+ num_valid = np.random.randint(low=min_num_valid_tokens, high=max_num_valid_tokens + 1, dtype=np.int64)
+
+ for index in range(num_left_padding):
+ masks[i, index] = 0
+
+ for index in range(num_left_padding + num_valid, sequence_length):
+ masks[i, index] = 0
+ return masks
+
+
+def compute_position_id_with_mask(mask):
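+ # e.g. (illustrative): mask [0, 0, 1, 1, 1] -> position_ids [0, 0, 0, 1, 2]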
+ return torch.clip(torch.cumsum(mask, dim=-1) - 1, min=0, max=None)
+
+
+def normalize_pp_vpp_params(params, num_hidden_layers, layer_name='layers'):
+ """
+ Normalize the pp/vpp params into a complete set of named parameters.
+ This is useful when parameters are gathered from pp ranks and passed to a model without pp.
+
+ params: List[List[Dict[str, param]]]
+ a list over pp stages, each containing a list over vpp chunks of named_parameters dicts
+ output: Dict[str, param]
+
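+ Example (illustrative): with pp_size=2, vpp_size=1 and num_hidden_layers=4,
+ 'model.layers.0.mlp.weight' coming from pp_rank=1 is renamed to
+ 'model.layers.2.mlp.weight' (layer_offset = (4 // 2) * 1 = 2).
+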
+ """
+
+ def normalize_model_name(name, pp_rank, vpp_rank, pp_size, vpp_size, num_layers):
+ """
+ Transform the model name in each model_chunk in each pp stage into the name in inference engine
+ """
+ if vpp_size > 1:
+ # print(f'try to bind vpp params to inference engine...')
+ layers_per_pp = num_layers // pp_size
+ layers_per_vpp = layers_per_pp // vpp_size
+ pp_offset = layers_per_vpp * pp_rank
+ vpp_offset = (layers_per_vpp * pp_size) * vpp_rank
+ layer_offset = pp_offset + vpp_offset
+ else:
+ layers_per_pp = num_layers // pp_size
+ layer_offset = layers_per_pp * pp_rank
+
+ if layer_name in name: # belong to an intermediate layer
+ split_name = name.split('.')
+ # find the layer number that follows layer_name
+ for i, sub_name in enumerate(split_name):
+ if sub_name == layer_name:
+ break
+ layer_num_idx = i + 1
+ # check the name
+ assert len(split_name) >= layer_num_idx + 1, f'split_name = {split_name}'
+ assert split_name[layer_num_idx].isdigit(), f'split_name = {split_name}'
+ # increment layer_num_idx by layer_offset
+ split_name[layer_num_idx] = str(int(split_name[layer_num_idx]) + layer_offset)
+ name = '.'.join(split_name) # weight name in inference_tp_model
+ return name
+
+ pp_size = len(params)
+ normalized_name_to_param = {}
+ for pp_rank in range(len(params)):
+ vpp_size = len(params[pp_rank])
+ for vpp_rank in range(vpp_size):
+ for name, param in params[pp_rank][vpp_rank].items():
+ normalized_name = normalize_model_name(name, pp_rank, vpp_rank, pp_size, vpp_size, num_hidden_layers)
+ normalized_name_to_param[normalized_name] = param
+
+ return normalized_name_to_param
+
+
+def get_parallel_model_from_config(config, megatron_config, pre_process=None, post_process=None, value=False):
+ from megatron.core import ModelParallelConfig
+ assert isinstance(megatron_config, ModelParallelConfig)
+ model_class = _get_parallel_model_architecture_from_config(config, value)
+
+ model = model_class(config, megatron_config, pre_process=pre_process, post_process=post_process)
+ return model
+
+
+def _get_parallel_model_architecture_from_config(config: PretrainedConfig, value=False) -> Type[nn.Module]:
+ architectures = getattr(config, "architectures", [])
+ for arch in architectures:
+ model_cls = ModelRegistry.load_model_cls(arch, value)
+ if model_cls is not None:
+ return model_cls
+ raise ValueError(f"Model architectures {architectures} are not supported for now. "
+ f"Supported architectures: {ModelRegistry.get_supported_archs()}")
+
+
+def load_megatron_model_weights(config,
+ model_config,
+ parallel_model,
+ params_dtype,
+ is_value_model=False,
+ local_cache_path='~/.cache/verl/rlhf'):
+ assert hasattr(model_config, "architectures"), "architectures cannot be empty when load weight!"
+ architectures = getattr(model_config, "architectures", [])
+ local_cache_path = os.path.expanduser(local_cache_path)
+
+ if config.model.path.startswith("hdfs:"):
+ from verl.utils.fs import copy_local_path_from_hdfs
+ print(f'start download from {config.model.path}')
+ local_model_path = copy_local_path_from_hdfs(src=config.model.path, cache_dir=local_cache_path)
+ print('finish download')
+ else:
+ print(f"load from local dir {config.model.path}")
+ local_model_path = config.model.path
+
+ # TODO: to find a better way to load mistral7b-rm lm_head
+ if 'mistral7b-rm' in config.model.path:
+ model = MistralForSequenceClassification.from_pretrained(local_model_path) # use score head instead of lm_head
+ state_dict = model.state_dict()
+ state_dict['lm_head.weight'] = state_dict['score.weight']
+ state_dict['model.embed_tokens.weight'] = state_dict[
+ 'model.embed_tokens.weight'][:32000] # workaround, 32001 -> 32000
+ is_value_model = True
+ else:
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ model = AutoModelForCausalLM.from_pretrained(local_model_path)
+ state_dict = model.state_dict()
+
+ from verl.models.weight_loader_registry import get_weight_loader
+ print(f'before weight loader: architectures = {architectures}...')
+ for arch in architectures:
+ print(f'call weight loader arch = {arch}, model config = {model.config}')
+ weight_loader = get_weight_loader(arch)
+ weight_loader(state_dict=state_dict,
+ wrapped_models=parallel_model,
+ config=model.config,
+ params_dtype=params_dtype,
+ is_value_model=is_value_model)
+
+
+# pad input_ids_rmpad, cu_seqlens and max_seqlen_in_batch to be divisible by tp
+def pad_packed_inputs(unpad_tokens: torch.Tensor, cu_seqlens, max_seqlen_in_batch, size):
+ """pad the tokens such that the total length is a multiple of size.
+ This function is useful when applying sequence parallel and context parallel
+
+ Args:
+ unpad_tokens: (total_nnz, ...). Tokens after removing padding
+ cu_seqlens: (total_nnz + 1,)
+ max_seqlen_in_batch: int
+
+ Returns:
+
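+ Example (illustrative): with total_nnz=10 and size=4, pad_size=2: two pad tokens
+ are appended as one extra "sequence", cu_seqlens gains a final entry equal to
+ cu_seqlens[-1] + 2, and max_seqlen_in_batch becomes max(max_seqlen_in_batch, 2).
+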
+ """
+ F = nn.functional
+
+ total_nnz = unpad_tokens.shape[0]
+
+ if total_nnz % size == 0:
+ pad_size = 0
+ else:
+ pad_size = size - total_nnz % size
+
+ # we assume adding a new data in the batch with seqlen pad_size
+ if pad_size > 0:
+ if unpad_tokens.ndim == 1:
+ unpad_tokens = F.pad(unpad_tokens, (0, pad_size))
+ elif unpad_tokens.ndim == 2:
+ unpad_tokens = F.pad(unpad_tokens, (0, 0, 0, pad_size))
+ else:
+ raise NotImplementedError(f'Padding dim {unpad_tokens.ndim} is not supported')
+
+ cu_seqlens = F.pad(cu_seqlens, (0, 1), value=pad_size + cu_seqlens[-1])
+ max_seqlen_in_batch = max(max_seqlen_in_batch, pad_size)
+
+ return unpad_tokens, cu_seqlens, max_seqlen_in_batch
diff --git a/code/RL_model/verl/Search-R1/verl/utils/py_functional.py b/code/RL_model/verl/Search-R1/verl/utils/py_functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f5a0e176779cc19d3035a3af77a1bdf1f39349a
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/py_functional.py
@@ -0,0 +1,56 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Contain small python utility functions
+"""
+
+from typing import Dict
+from types import SimpleNamespace
+
+
+def union_two_dict(dict1: Dict, dict2: Dict):
+ """Union two dict. Will throw an error if there is an item not the same object with the same key.
+
+ Args:
+ dict1:
+ dict2:
+
+ Returns:
+
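+ Example (illustrative):
+ union_two_dict({'a': 1}, {'b': 2}) -> {'a': 1, 'b': 2}
+ union_two_dict({'a': 1}, {'a': 2}) raises AssertionError
+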
+ """
+ for key, val in dict2.items():
+ if key in dict1:
+ assert dict2[key] == dict1[key], \
+ f'{key} in meta_dict1 and meta_dict2 are not the same object'
+ dict1[key] = val
+
+ return dict1
+
+
+def append_to_dict(data: Dict, new_data: Dict):
+ for key, val in new_data.items():
+ if key not in data:
+ data[key] = []
+ data[key].append(val)
+
+
+class NestedNamespace(SimpleNamespace):
+
+ def __init__(self, dictionary, **kwargs):
+ super().__init__(**kwargs)
+ for key, value in dictionary.items():
+ if isinstance(value, dict):
+ self.__setattr__(key, NestedNamespace(value))
+ else:
+ self.__setattr__(key, value)
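+
+
+# Illustrative usage (not part of the original code):
+#   cfg = NestedNamespace({'model': {'lr': 1e-4, 'name': 'actor'}})
+#   cfg.model.lr   # -> 0.0001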
diff --git a/code/RL_model/verl/Search-R1/verl/utils/ray_utils.py b/code/RL_model/verl/Search-R1/verl/utils/ray_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a75df6c37bc5a295aaa192b2a56cca2423e94b9
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/ray_utils.py
@@ -0,0 +1,43 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Contains commonly used utilities for ray
+"""
+
+import ray
+
+import concurrent.futures
+
+
+def parallel_put(data_list, max_workers=None):
+ """Call ray.put on each item of data_list concurrently; return the object refs in input order."""
+
+ def put_data(index, data):
+ return index, ray.put(data)
+
+ if max_workers is None:
+ max_workers = min(len(data_list), 16)
+
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+ data_list_f = [executor.submit(put_data, i, data) for i, data in enumerate(data_list)]
+ res_lst = []
+ for future in concurrent.futures.as_completed(data_list_f):
+ res_lst.append(future.result())
+
+ # reorder based on index
+ output = [None for _ in range(len(data_list))]
+ for res in res_lst:
+ index, data_ref = res
+ output[index] = data_ref
+
+ return output
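+
+
+# Illustrative usage (not part of the original code):
+#   refs = parallel_put([obj0, obj1, obj2])  # object refs, in input order
+#   objs = ray.get(refs)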
diff --git a/code/RL_model/verl/Search-R1/verl/utils/rendezvous/__init__.py b/code/RL_model/verl/Search-R1/verl/utils/rendezvous/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/rendezvous/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/utils/rendezvous/ray_backend.py b/code/RL_model/verl/Search-R1/verl/utils/rendezvous/ray_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0d2bd906fe14584896627143dea2d3ec032d912
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/rendezvous/ray_backend.py
@@ -0,0 +1,77 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import time
+
+from cupy.cuda.nccl import NcclCommunicator, get_unique_id
+
+import ray
+from ray.util import list_named_actors
+
+
+@ray.remote
+class NCCLIDStore:
+
+ def __init__(self, nccl_id):
+ self._nccl_id = nccl_id
+
+ def get(self):
+ return self._nccl_id
+
+
+def get_nccl_id_store_by_name(name):
+ all_actors = list_named_actors(all_namespaces=True)
+ matched_actors = [actor for actor in all_actors if actor.get("name", None) == name]
+ if len(matched_actors) == 1:
+ actor = matched_actors[0]
+ return ray.get_actor(**actor)
+ elif len(matched_actors) > 1:
+ logging.warning(f"multiple actors with same name found: {matched_actors}")
+ elif len(matched_actors) == 0:
+ logging.info(f"failed to get any actor named {name}")
+ return None
+
+
+def create_nccl_communicator_in_ray(rank: int,
+ world_size: int,
+ group_name: str,
+ max_retries: int = 100,
+ interval_s: int = 5):
+ if rank == 0:
+ nccl_id = get_unique_id()
+ nccl_id_store = NCCLIDStore.options(name=group_name).remote(nccl_id)
+
+ assert ray.get(nccl_id_store.get.remote()) == nccl_id
+ communicator = NcclCommunicator(
+ ndev=world_size,
+ commId=nccl_id,
+ rank=0,
+ )
+ return communicator
+ else:
+ for i in range(max_retries):
+ nccl_id_store = get_nccl_id_store_by_name(group_name)
+ if nccl_id_store is not None:
+ logging.info(f"nccl_id_store {group_name} got")
+ nccl_id = ray.get(nccl_id_store.get.remote())
+ logging.info(f"nccl id for {group_name} got: {nccl_id}")
+ communicator = NcclCommunicator(
+ ndev=world_size,
+ commId=nccl_id,
+ rank=rank,
+ )
+ return communicator
+ logging.info(f"failed to get nccl_id for {i+1} time, sleep for {interval_s} seconds")
+ time.sleep(interval_s)
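+
+
+# Illustrative usage (not part of the original code): each of the `world_size`
+# ranks calls this with the same group_name; rank 0 publishes the NCCL unique id
+# through a named Ray actor, and the other ranks poll until it becomes visible.
+#   comm = create_nccl_communicator_in_ray(rank, world_size, group_name="rollout_sync")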
diff --git a/code/RL_model/verl/Search-R1/verl/utils/reward_score/__init__.py b/code/RL_model/verl/Search-R1/verl/utils/reward_score/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/reward_score/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/utils/reward_score/countdown.py b/code/RL_model/verl/Search-R1/verl/utils/reward_score/countdown.py
new file mode 100644
index 0000000000000000000000000000000000000000..14d414018314b6f0950cd201d09927f883e2216d
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/reward_score/countdown.py
@@ -0,0 +1,111 @@
+import re
+import random
+import ast
+import operator
+
+
+def extract_solution(solution_str):
+ """Extract the equation from the solution string."""
+ # Remove everything before the first "Assistant:"
+ if "Assistant:" in solution_str:
+ solution_str = solution_str.split("Assistant:", 1)[1]
+ elif "<|im_start|>assistant" in solution_str:
+ solution_str = solution_str.split("<|im_start|>assistant", 1)[1]
+ else:
+ return None
+ solution_str = solution_str.split('\n')[-1]
+
+ answer_pattern = r'<answer>(.*?)</answer>'
+ match = re.finditer(answer_pattern, solution_str)
+ matches = list(match)
+ if matches:
+ final_answer = matches[-1].group(1).strip()
+ else:
+ final_answer = None
+ return final_answer
+
+
+def validate_equation(equation_str, available_numbers):
+ """Validate that equation only uses available numbers and each number once."""
+ try:
+ # Extract all numbers from the equation
+ numbers_in_eq = [int(n) for n in re.findall(r'\d+', equation_str)]
+
+ # Check if all numbers in equation are available
+ available_numbers = sorted(available_numbers)
+ numbers_in_eq = sorted(numbers_in_eq)
+
+ # Each number should be used exactly once
+ return numbers_in_eq == available_numbers
+ except Exception:
+ return False
+
+
+def evaluate_equation(equation_str):
+ """Safely evaluate the arithmetic equation using eval() with precautions."""
+ try:
+ # Define a regex pattern that only allows numbers, operators, parentheses, and whitespace
+ allowed_pattern = r'^[\d+\-*/().\s]+$'
+ if not re.match(allowed_pattern, equation_str):
+ raise ValueError("Invalid characters in equation.")
+
+ # Evaluate the equation with restricted globals and locals
+ result = eval(equation_str, {"__builtins__": None}, {})
+ return result
+ except Exception as e:
+ return None
+
+
+def compute_score(solution_str, ground_truth, method='strict', format_score=0.1, score=1.):
+ """The scoring function for countdown task.
+
+ Args:
+ solution_str: the solution text
+ ground_truth: dictionary containing target number and available numbers
+ method: the method to extract the solution
+ format_score: the score for correct format but wrong answer
+ score: the score for the correct answer
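+
+ Example (illustrative): with ground_truth {'target': 10, 'numbers': [2, 3, 4]},
+ an extracted equation "2*3+4" evaluates to 10 and earns `score`, while "2+3"
+ uses the wrong multiset of numbers and earns only `format_score`.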
+ """
+ target = ground_truth['target']
+ numbers = ground_truth['numbers']
+
+ equation = extract_solution(solution_str=solution_str)
+ do_print = random.randint(1, 64) == 1
+
+ if do_print:
+ print(f"--------------------------------")
+ print(f"Target: {target} | Numbers: {numbers}")
+ print(f"Extracted equation: {equation}")
+ print(f"Solution string: {solution_str}")
+
+ if equation is None:
+ if do_print:
+ print(f"No equation found")
+ return 0
+
+ # Validate equation uses correct numbers
+ if not validate_equation(equation, numbers):
+ if do_print:
+ print(f"Invalid equation")
+ return format_score
+
+ # Evaluate equation
+ try:
+ result = evaluate_equation(equation)
+ if result is None:
+ if do_print:
+ print(f"Could not evaluate equation")
+ return format_score
+
+ if abs(result - target) < 1e-5: # Account for floating point precision
+ if do_print:
+ print(f"Correct equation: {equation} = {result}")
+ return score
+ else:
+ if do_print:
+ print(f"Wrong result: equation = {result}, target = {target}")
+ return format_score
+ except Exception:
+ if do_print:
+ print(f"Error evaluating equation")
+ return format_score
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/utils/reward_score/gsm8k.py b/code/RL_model/verl/Search-R1/verl/utils/reward_score/gsm8k.py
new file mode 100644
index 0000000000000000000000000000000000000000..7091037643bc656f93c5c1a6acefb643d58421fe
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/reward_score/gsm8k.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+
+def extract_solution(solution_str, method='strict'):
+ assert method in ['strict', 'flexible']
+
+ if method == 'strict':
+ # this also tests the formatting of the model
+ solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
+ if solution is None:
+ final_answer = None
+ else:
+ final_answer = solution.group(0)
+ final_answer = final_answer.split('#### ')[1].replace(',', '').replace('$', '')
+ elif method == 'flexible':
+ answer = re.findall("(\\-?[0-9\\.\\,]+)", solution_str)
+ final_answer = None
+ if len(answer) == 0:
+ # no reward if there is no answer
+ pass
+ else:
+ invalid_str = ['', '.']
+ # find the last number that is not '.'
+ for final_answer in reversed(answer):
+ if final_answer not in invalid_str:
+ break
+ return final_answer
+
+
+def compute_score(solution_str, ground_truth, method='strict', format_score=0., score=1.):
+ """The scoring function for GSM8k.
+
+ Reference: Trung, Luong, et al. "Reft: Reasoning with reinforced fine-tuning." Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). 2024.
+
+ Args:
+ solution_str: the solution text
+ ground_truth: the ground truth
+ method: the method to extract the solution, choices are 'strict' and 'flexible'
+ format_score: the score for the format
+ score: the score for the correct answer
+ """
+ answer = extract_solution(solution_str=solution_str, method=method)
+ if answer is None:
+ return 0
+ else:
+ if answer == ground_truth:
+ return score
+ else:
+ return format_score
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/utils/reward_score/math.py b/code/RL_model/verl/Search-R1/verl/utils/reward_score/math.py
new file mode 100644
index 0000000000000000000000000000000000000000..50792aa6edd082091a786f4d4fa29d0a601702cf
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/reward_score/math.py
@@ -0,0 +1,227 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/hendrycks_math/utils.py
+
+
+def compute_score(solution_str, ground_truth) -> float:
+ retval = 0.
+ try:
+ string_in_last_boxed = last_boxed_only_string(solution_str)
+ if string_in_last_boxed is not None:
+ answer = remove_boxed(string_in_last_boxed)
+ if is_equiv(answer, ground_truth):
+ retval = 1.
+ except Exception as e:
+ print(e)
+
+ return retval
+
+
+# string normalization from https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/hendrycks_math.py
+def is_equiv(str1, str2, verbose=False):
+ if str1 is None and str2 is None:
+ print("WARNING: Both None")
+ return True
+ if str1 is None or str2 is None:
+ return False
+
+ try:
+ ss1 = strip_string(str1)
+ ss2 = strip_string(str2)
+ if verbose:
+ print(ss1, ss2)
+ return ss1 == ss2
+ except Exception:
+ return str1 == str2
+
+
+def remove_boxed(s):
+ if "\\boxed " in s:
+ left = "\\boxed "
+ assert s[:len(left)] == left
+ return s[len(left):]
+
+ left = "\\boxed{"
+
+ assert s[:len(left)] == left
+ assert s[-1] == "}"
+
+ return s[len(left):-1]
+
+
+def last_boxed_only_string(string):
+ idx = string.rfind("\\boxed")
+ if "\\boxed " in string:
+ return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
+ if idx < 0:
+ idx = string.rfind("\\fbox")
+ if idx < 0:
+ return None
+
+ i = idx
+ right_brace_idx = None
+ num_left_braces_open = 0
+ while i < len(string):
+ if string[i] == "{":
+ num_left_braces_open += 1
+ if string[i] == "}":
+ num_left_braces_open -= 1
+ if num_left_braces_open == 0:
+ right_brace_idx = i
+ break
+ i += 1
+
+ if right_brace_idx is None:
+ retval = None
+ else:
+ retval = string[idx:right_brace_idx + 1]
+
+ return retval
+
+
+def fix_fracs(string):
+ substrs = string.split("\\frac")
+ new_str = substrs[0]
+ if len(substrs) > 1:
+ substrs = substrs[1:]
+ for substr in substrs:
+ new_str += "\\frac"
+ if substr[0] == "{":
+ new_str += substr
+ else:
+ try:
+ assert len(substr) >= 2
+ except AssertionError:
+ return string
+ a = substr[0]
+ b = substr[1]
+ if b != "{":
+ if len(substr) > 2:
+ post_substr = substr[2:]
+ new_str += "{" + a + "}{" + b + "}" + post_substr
+ else:
+ new_str += "{" + a + "}{" + b + "}"
+ else:
+ if len(substr) > 2:
+ post_substr = substr[2:]
+ new_str += "{" + a + "}" + b + post_substr
+ else:
+ new_str += "{" + a + "}" + b
+ string = new_str
+ return string
+
+
+def fix_a_slash_b(string):
+ if len(string.split("/")) != 2:
+ return string
+ a = string.split("/")[0]
+ b = string.split("/")[1]
+ try:
+ a = int(a)
+ b = int(b)
+ assert string == "{}/{}".format(a, b)
+ new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
+ return new_string
+ except AssertionError:
+ return string
+
+
+def remove_right_units(string):
+ # "\\text{ " only ever occurs (at least in the val set) when describing units
+ if "\\text{ " in string:
+ splits = string.split("\\text{ ")
+ assert len(splits) == 2
+ return splits[0]
+ else:
+ return string
+
+
+def fix_sqrt(string):
+ if "\\sqrt" not in string:
+ return string
+ splits = string.split("\\sqrt")
+ new_string = splits[0]
+ for split in splits[1:]:
+ if split[0] != "{":
+ a = split[0]
+ new_substr = "\\sqrt{" + a + "}" + split[1:]
+ else:
+ new_substr = "\\sqrt" + split
+ new_string += new_substr
+ return new_string
+
+
+def strip_string(string):
+ # linebreaks
+ string = string.replace("\n", "")
+
+ # remove inverse spaces
+ string = string.replace("\\!", "")
+
+ # replace \\ with \
+ string = string.replace("\\\\", "\\")
+
+ # replace tfrac and dfrac with frac
+ string = string.replace("tfrac", "frac")
+ string = string.replace("dfrac", "frac")
+
+ # remove \left and \right
+ string = string.replace("\\left", "")
+ string = string.replace("\\right", "")
+
+ # Remove circ (degrees)
+ string = string.replace("^{\\circ}", "")
+ string = string.replace("^\\circ", "")
+
+ # remove dollar signs
+ string = string.replace("\\$", "")
+
+ # remove units (on the right)
+ string = remove_right_units(string)
+
+ # remove percentage
+ string = string.replace("\\%", "")
+ string = string.replace("\%", "") # noqa: W605
+
+ # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
+ string = string.replace(" .", " 0.")
+ string = string.replace("{.", "{0.")
+ # if empty, return empty string
+ if len(string) == 0:
+ return string
+ if string[0] == ".":
+ string = "0" + string
+
+ # to consider: get rid of e.g. "k = " or "q = " at beginning
+ if len(string.split("=")) == 2:
+ if len(string.split("=")[0]) <= 2:
+ string = string.split("=")[1]
+
+ # fix sqrt3 --> sqrt{3}
+ string = fix_sqrt(string)
+
+ # remove spaces
+ string = string.replace(" ", "")
+
+ # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
+ string = fix_fracs(string)
+
+ # manually change 0.5 --> \frac{1}{2}
+ if string == "0.5":
+ string = "\\frac{1}{2}"
+
+ # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
+ string = fix_a_slash_b(string)
+
+ return string
diff --git a/code/RL_model/verl/Search-R1/verl/utils/reward_score/multiply.py b/code/RL_model/verl/Search-R1/verl/utils/reward_score/multiply.py
new file mode 100644
index 0000000000000000000000000000000000000000..71737f94f0b095e1bb49f8f85290c7bee8539bc0
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/reward_score/multiply.py
@@ -0,0 +1,58 @@
+import re
+import random
+
+
+def extract_solution(solution_str):
+ # Remove everything before the first "Assistant:"
+ if "Assistant:" in solution_str:
+ solution_str = solution_str.split("Assistant:", 1)[1]
+ else:
+ return None
+
+ answer_pattern = r'<answer>(.*?)</answer>'
+ match = re.finditer(answer_pattern, solution_str)
+ matches = list(match)
+ if matches:
+ final_answer = matches[-1].group(1).strip()
+ else:
+ final_answer = None
+ if final_answer is not None:
+ try:
+ int_final_answer = int(final_answer)
+ except ValueError:
+ final_answer = None
+ return final_answer
+
+
+def compute_score(solution_str, ground_truth, method='strict', format_score=0.1, score=1.):
+ """The scoring function for GSM8k.
+
+ Reference: Trung, Luong, et al. "Reft: Reasoning with reinforced fine-tuning." Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). 2024.
+
+ Args:
+ solution_str: the solution text
+ ground_truth: the ground truth
+ method: the method to extract the solution, choices are 'strict' and 'flexible'
+ format_score: the score for the format
+ score: the score for the correct answer
+ """
+ answer = extract_solution(solution_str=solution_str)
+ do_print = random.randint(1, 64) == 1
+ if do_print:
+ print(f"--------------------------------")
+ print(f"Ground truth: {ground_truth} | Extracted answer: {answer}")
+ print(f"Solution string: {solution_str}")
+
+ if answer is None:
+ if do_print:
+ print(f"No answer found")
+ return 0
+ else:
+ if int(answer) == int(ground_truth):
+ if do_print:
+ print(f"Correct answer: {answer}")
+ return score
+ else:
+ if do_print:
+ print(f"Incorrect answer {answer} | Ground truth: {ground_truth}")
+ return format_score
diff --git a/code/RL_model/verl/Search-R1/verl/utils/reward_score/qa_em.py b/code/RL_model/verl/Search-R1/verl/utils/reward_score/qa_em.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e0282034b0099c09ed200f78215cf239b45ec68
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/reward_score/qa_em.py
@@ -0,0 +1,138 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+import string
+import random
+
+def normalize_answer(s):
+ def remove_articles(text):
+ return re.sub(r"\b(a|an|the)\b", " ", text)
+
+ def white_space_fix(text):
+ return " ".join(text.split())
+
+ def remove_punc(text):
+ exclude = set(string.punctuation)
+ return "".join(ch for ch in text if ch not in exclude)
+
+ def lower(text):
+ return text.lower()
+
+ return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def em_check(prediction, golden_answers):
+ if isinstance(golden_answers, str):
+ golden_answers = [golden_answers]
+ normalized_prediction = normalize_answer(prediction)
+ score = 0
+ for golden_answer in golden_answers:
+ golden_answer = normalize_answer(golden_answer)
+ if golden_answer == normalized_prediction:
+ score = 1
+ break
+ return score
+
+
+def subem_check(prediction, golden_answers):
+ if isinstance(golden_answers, str):
+ golden_answers = [golden_answers]
+ normalized_prediction = normalize_answer(prediction)
+ score = 0
+ for golden_answer in golden_answers:
+ golden_answer = normalize_answer(golden_answer)
+ if golden_answer in normalized_prediction:
+ score = 1
+ break
+ return score
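+
+# Illustrative examples (not part of the original code):
+#   em_check("The Eiffel Tower", ["eiffel tower"]) -> 1 (exact match after normalization)
+#   subem_check("it is the eiffel tower", ["eiffel tower"]) -> 1 (substring match)
+#   em_check("eiffel", ["eiffel tower"]) -> 0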
+
+
+def extract_solution(solution_str):
+ """Extract the equation from the solution string."""
+ # Remove everything before the first "Assistant:"
+ # if "Assistant:" in solution_str:
+ # solution_str = solution_str.split("Assistant:", 1)[1]
+ # elif "<|im_start|>assistant" in solution_str:
+ # solution_str = solution_str.split("<|im_start|>assistant", 1)[1]
+ # else:
+ # return None
+ # solution_str = solution_str.split('\n')[-1]
+
+ answer_pattern = r'<answer>(.*?)</answer>'
+ match = re.finditer(answer_pattern, solution_str, re.DOTALL)
+ matches = list(match)
+
+ # If there are 0 or exactly 1 matches, return None
+ if len(matches) <= 1:
+ return None
+
+ # If there are 2 or more matches, return the last one
+ return matches[-1].group(1).strip()
+
+
+def compute_score_em(solution_str, ground_truth, method='strict', format_score=0., score=1.):
+ """The scoring function for exact match (EM).
+
+ Args:
+ solution_str: the solution text
+ ground_truth: the ground truth
+ method: the method to extract the solution, choices are 'strict' and 'flexible'
+ format_score: the score for the format
+ score: the score for the correct answer
+ """
+ answer = extract_solution(solution_str=solution_str)
+ do_print = random.randint(1, 64) == 1
+
+ if do_print:
+ print(f"--------------------------------")
+ print(f"Golden answers: {ground_truth['target']}")
+ print(f"Extracted answer: {answer}")
+ print(f"Solution string: {solution_str}")
+
+ if answer is None:
+ return 0
+ else:
+ if em_check(answer, ground_truth['target']):
+ return score
+ else:
+ return format_score
+
+
+def compute_score_subem(solution_str, ground_truth, method='strict', format_score=0., score=1.):
+ """The scoring function for substring exact match (EM).
+
+ Args:
+ solution_str: the solution text
+ ground_truth: the ground truth
+ method: the method to extract the solution, choices are 'strict' and 'flexible'
+ format_score: the score for the format
+ score: the score for the correct answer
+ """
+ answer = extract_solution(solution_str=solution_str)
+ do_print = random.randint(1, 64) == 1
+
+ if do_print:
+ print(f"--------------------------------")
+ print(f"Golden answers: {ground_truth['target']}")
+ print(f"Extracted answer: {answer}")
+ print(f"Solution string: {solution_str}")
+
+ if answer is None:
+ return 0
+ else:
+ if subem_check(answer, ground_truth['target']):
+ return score
+ else:
+ return format_score
diff --git a/code/RL_model/verl/Search-R1/verl/utils/reward_score/qa_em_format.py b/code/RL_model/verl/Search-R1/verl/utils/reward_score/qa_em_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..a95f70e22c86a813e9f0e7316c255988898d828f
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/reward_score/qa_em_format.py
@@ -0,0 +1,197 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+import string
+import random
+
+def normalize_answer(s):
+ def remove_articles(text):
+ return re.sub(r"\b(a|an|the)\b", " ", text)
+
+ def white_space_fix(text):
+ return " ".join(text.split())
+
+ def remove_punc(text):
+ exclude = set(string.punctuation)
+ return "".join(ch for ch in text if ch not in exclude)
+
+ def lower(text):
+ return text.lower()
+
+ return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def em_check(prediction, golden_answers):
+ if isinstance(golden_answers, str):
+ golden_answers = [golden_answers]
+ normalized_prediction = normalize_answer(prediction)
+ score = 0
+ for golden_answer in golden_answers:
+ golden_answer = normalize_answer(golden_answer)
+ if golden_answer == normalized_prediction:
+ score = 1
+ break
+ return score
+
+
+def is_valid_sequence(text):
+ """Check that the assistant response follows the expected <think>/<search>/<information>/<answer> structure."""
+ # Find the position of "<|im_start|>assistant" with potential whitespace
+ assistant_pattern = r"<\|im_start\|>assistant\s*"
+ assistant_match = re.search(assistant_pattern, text)
+
+ if not assistant_match:
+ return False, "Missing assistant marker"
+
+ # Extract the content after the assistant marker
+ start_pos = assistant_match.end()
+ content = text[start_pos:]
+
+ # Check for balanced tags
+ tags_to_check = ["think", "search", "information", "answer"]
+ for tag in tags_to_check:
+ opening_count = len(re.findall(f"<{tag}>", content))
+ closing_count = len(re.findall(f"</{tag}>", content))
+ if opening_count != closing_count:
+ return False, f"Mismatch in {tag} tags: {opening_count} opening vs {closing_count} closing tags"
+
+ # Now check for proper sequence pattern and no extraneous content
+
+ # 1. First split the content by any tags we recognize
+ split_pattern = r"(</?(?:think|search|information|answer)>)"
+ parts = re.split(split_pattern, content)
+
+ # 2. Keep track of the current position in the expected sequence
+ state = "start" # start -> think -> search -> information -> think -> ... -> answer -> end
+
+ # 3. Check each part
+ for i, part in enumerate(parts):
+ # Skip empty parts
+ if not part.strip():
+ continue
+
+ # Check if this is a tag
+ if re.match(r"?(?:think|search|information|answer)>", part):
+ # This is a tag, check if it's valid in the current state
+ if part == "" and state in ["start", "information"]:
+ state = "in_think"
+ elif part == "" and state == "in_think":
+ state = "after_think"
+ elif part == "" and state == "after_think":
+ state = "in_search"
+ elif part == "" and state == "in_search":
+ state = "after_search"
+ elif part == "" and state == "after_search":
+ state = "in_information"
+ elif part == "" and state == "in_information":
+ state = "information"
+ elif part == "" and state == "after_think":
+ state = "in_answer"
+ elif part == "" and state == "in_answer":
+ state = "end"
+ else:
+ return False, f"Unexpected tag {part} in state {state}"
+ else:
+ # This is content, check if it's valid in the current state
+ if state in ["in_think", "in_search", "in_information", "in_answer"]:
+ # Content is allowed inside tags
+ pass
+ elif state in ["start", "after_think", "after_search", "information"]:
+ # Only whitespace is allowed between tags
+ if part.strip():
+ return False, f"Unexpected content '{part.strip()}' between tags (state: {state})"
+ else:
+ return False, f"Unexpected content in state {state}"
+
+ # Check final state
+ if state != "end":
+ return False, f"Incomplete sequence, ended in state {state}"
+
+ return True, "Valid sequence format"
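+
+# Illustrative note (not part of the original code): a minimal valid rollout is
+#   "<|im_start|>assistant <think>...</think><answer>...</answer>"
+# and each retrieval turn follows
+#   <think> ... </think> <search> ... </search> <information> ... </information>
+# before the final <think>/<answer> pair.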
+
+
+def extract_solution(solution_str):
+ """Extract the equation from the solution string."""
+
+ answer_pattern = r'<answer>(.*?)</answer>'
+ match = re.finditer(answer_pattern, solution_str, re.DOTALL)
+ matches = list(match)
+
+ # If there are 0 or exactly 1 matches, return None
+ if len(matches) <= 1:
+ return None
+
+ # If there are 2 or more matches, return the last one
+ return matches[-1].group(1).strip()
+
+
+def extract_information_blocks(text: str) -> list[str]:
+ pattern = r"(.*?)"
+ matches = re.findall(pattern, text, re.DOTALL)
+ return [match.strip() for match in matches]
+
+
+def is_retrieval_correct(text: str, golden_answers: list[str]) -> bool:
+ seqs = extract_information_blocks(text)
+ for seq in seqs:
+ for golden_answer in golden_answers:
+ if normalize_answer(golden_answer) in normalize_answer(seq):
+ return True
+ return False
+
+
+def compute_score_em(solution_str, ground_truth, method='strict', structure_format_score=0, final_format_score=0, retrieval_score=0, format_score=0, score=1.):
+ """The scoring function for exact match (EM).
+
+ Args:
+ solution_str: the solution text
+ ground_truth: the ground truth
+ method: the method to extract the solution, choices are 'strict' and 'flexible'
+ structure_format_score: score awarded for a well-structured (valid tag sequence) response
+ final_format_score: score awarded when only the final answer format is correct
+ retrieval_score: extra score when the retrieved information contains a golden answer
+ format_score: unused; kept for interface compatibility
+ score: the score for the correct answer
+ """
+ is_valid_format, _ = is_valid_sequence(solution_str)
+ retrieval_correct = False
+ if is_valid_format:
+ retrieval_correct = is_retrieval_correct(solution_str, ground_truth['target'])
+ answer = extract_solution(solution_str=solution_str)
+ do_print = random.randint(1, 64) == 1
+
+ if do_print:
+ print(f"--------------------------------")
+ print(f"Golden answers: {ground_truth['target']}")
+ print(f"Extracted answer: {answer}")
+ print(f"Solution string: {solution_str}")
+
+ if answer is None:
+ if is_valid_format:
+ if retrieval_correct:
+ return structure_format_score + retrieval_score # 0.3
+ else:
+ return structure_format_score # 0.2
+ else:
+ return 0
+ else:
+ if em_check(answer, ground_truth['target']):
+ if is_valid_format:
+ return score # 1
+ else:
+ return score - structure_format_score # 0.8
+ elif is_valid_format:
+ if retrieval_correct:
+ return structure_format_score + retrieval_score # 0.3
+ else:
+ return structure_format_score # 0.2
+ else:
+ return final_format_score # 0.1
diff --git a/code/RL_model/verl/Search-R1/verl/utils/reward_score/reward.py b/code/RL_model/verl/Search-R1/verl/utils/reward_score/reward.py
new file mode 100644
index 0000000000000000000000000000000000000000..b191ac40e566c1edfd9b1f53121aefe1d66ea412
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/reward_score/reward.py
@@ -0,0 +1,93 @@
+import json
+import re
+import concurrent.futures
+from openai import OpenAI
+
+class MedicalClaimVerifier:
+ def __init__(self):
+ # Update path as needed for your environment
+ api_file = "/home/mshahidul/api_new.json"
+ with open(api_file, "r") as f:
+ api_keys = json.load(f)
+ self.api_key = api_keys["openai"]
+ self.model_name = "gpt-5-nano" # Changed to a currently available model
+ self.client = OpenAI(api_key=self.api_key)
+
+ self.thresholds = {
+ "low": {"comp": 1.0, "cov": 0.3226},
+ "intermediate": {"comp": 1.0, "cov": 0.4091},
+ "proficient": {"comp": 1.0, "cov": 0.9347},
+ }
+
+ def get_prompt(self, context, claim):
+ return f"CONTEXT:\n{context}\n\nCLAIM TO VERIFY:\n{claim}\n\nINSTRUCTION:\nDoes the CONTEXT support the CLAIM? Output only 'supported' or 'not_supported'."
+
+ def check_support_api(self, prompt):
+ try:
+ response = self.client.chat.completions.create(
+ model=self.model_name,
+ messages=[{"role": "user", "content": prompt}],
+ )
+ res = response.choices[0].message.content.strip().lower()
+ return 1.0 if "supported" in res and "not_supported" not in res else 0.0
+ except Exception:
+ return 0.0
+
+ def evaluate_level(self, gen_text, gold_subs, full_subs, level_key):
+ if not gen_text: return 0.0, 0.0
+ with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
+ comp_results = list(executor.map(self.check_support_api, [self.get_prompt(gen_text, s) for s in gold_subs]))
+ cov_results = list(executor.map(self.check_support_api, [self.get_prompt(gen_text, s) for s in full_subs]))
+
+ comp_score = sum(comp_results) / len(comp_results) if comp_results else 0.0
+ cov_score = sum(cov_results) / len(cov_results) if cov_results else 0.0
+ return comp_score, cov_score
+
+# Global instance for the trainer
+verifier = MedicalClaimVerifier()
+
+def compute_score(data_source, solution_str, ground_truth, extra_info=None):
+ """
+ Standard verl entrypoint for reward calculation.
+ ground_truth is expected to be a JSON string containing 'gold_subs' and 'full_subs'.
+ """
+ # 1. Parse Ground Truth
+ try:
+ gt_data = json.loads(ground_truth)
+ gold_subs = gt_data['gold_subs']
+ full_subs = gt_data['full_subs']
+ except Exception:
+ return 0.0 # Return neutral if GT is mangled
+
+ # 2. Extract JSON from Model Response
+ try:
+ # Clean markdown wrappers
+ cleaned_str = solution_str.strip()
+ if cleaned_str.startswith("```json"):
+ cleaned_str = cleaned_str.split("```json")[1].split("```")[0].strip()
+ elif "```" in cleaned_str:
+ cleaned_str = cleaned_str.split("```")[1].split("```")[0].strip()
+
+ data = json.loads(cleaned_str)
+ except Exception:
+ return -5.0 # Format penalty
+
+ # 3. Scoring Logic
+ levels = ["low", "intermediate", "proficient"]
+ if not all(f"{lvl}_health_literacy" in data for lvl in levels):
+ return -2.0
+
+ total_reward = 0.0
+ for lvl in levels:
+ gen_text = data.get(f"{lvl}_health_literacy", "")
+ if not gen_text:
+ total_reward -= 2.0
+ continue
+
+ comp_score, cov_score = verifier.evaluate_level(gen_text, gold_subs, full_subs, lvl)
+
+ # Binary reward based on thresholds
+ total_reward += 1.0 if comp_score >= verifier.thresholds[lvl]["comp"] else -1.0
+ total_reward += 1.0 if cov_score >= verifier.thresholds[lvl]["cov"] else -1.0
+
+ return total_reward
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/utils/seqlen_balancing.py b/code/RL_model/verl/Search-R1/verl/utils/seqlen_balancing.py
new file mode 100644
index 0000000000000000000000000000000000000000..fee45da0d33264ea40591f95a98bdf35ef0ea4ad
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/seqlen_balancing.py
@@ -0,0 +1,265 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Tuple, Callable
+import heapq
+
+import torch
+from torch import distributed as dist
+
+from tensordict import TensorDict
+import copy
+
+
+def karmarkar_karp(seqlen_list: List[int], k_partitions: int, equal_size: bool):
+ # see: https://en.wikipedia.org/wiki/Largest_differencing_method
+ class Set:
+
+ def __init__(self) -> None:
+ self.sum = 0
+ self.items = []
+
+ def add(self, idx: int, val: int):
+ self.items.append((idx, val))
+ self.sum += val
+
+ def merge(self, other):
+ for idx, val in other.items:
+ self.items.append((idx, val))
+ self.sum += val
+
+ def __lt__(self, other):
+ if self.sum != other.sum:
+ return self.sum < other.sum
+ if len(self.items) != len(other.items):
+ return len(self.items) < len(other.items)
+ return self.items < other.items
+
+ class State:
+
+ def __init__(self, items: List[Tuple[int, int]], k: int) -> None:
+ self.k = k
+ # sets are kept in decreasing order of sum
+ self.sets = [Set() for _ in range(k)]
+ assert len(items) in [1, k], f"{len(items)} not in [1, {k}]"
+ for i, (idx, seqlen) in enumerate(items):
+ self.sets[i].add(idx=idx, val=seqlen)
+ self.sets = sorted(self.sets, reverse=True)
+
+ def get_partitions(self):
+ partitions = []
+ for i in range(len(self.sets)):
+ cur_partition = []
+ for idx, _ in self.sets[i].items:
+ cur_partition.append(idx)
+ partitions.append(cur_partition)
+ return partitions
+
+ def merge(self, other):
+ for i in range(self.k):
+ self.sets[i].merge(other.sets[self.k - 1 - i])
+ self.sets = sorted(self.sets, reverse=True)
+
+ @property
+ def spread(self) -> int:
+ return self.sets[0].sum - self.sets[-1].sum
+
+ def __lt__(self, other):
+ # least heap, let the state with largest spread to be popped first,
+ # if the spread is the same, let the state who has the largest set
+ # to be popped first.
+ if self.spread != other.spread:
+ return self.spread > other.spread
+ return self.sets[0] > other.sets[0]
+
+ def __repr__(self) -> str:
+ repr_str = "["
+ for i in range(self.k):
+ if i > 0:
+ repr_str += ","
+ repr_str += "{"
+ for j, (_, seqlen) in enumerate(self.sets[i].items):
+ if j > 0:
+ repr_str += ","
+ repr_str += str(seqlen)
+ repr_str += "}"
+ repr_str += "]"
+ return repr_str
+
+ sorted_seqlen_list = sorted([(seqlen, i) for i, seqlen in enumerate(seqlen_list)])
+ states_pq = []
+ if equal_size:
+ assert len(seqlen_list) % k_partitions == 0, f"{len(seqlen_list)} % {k_partitions} != 0"
+ for offset in range(0, len(sorted_seqlen_list), k_partitions):
+ items = []
+ for i in range(k_partitions):
+ seqlen, idx = sorted_seqlen_list[offset + i]
+ items.append((idx, seqlen))
+ heapq.heappush(states_pq, State(items=items, k=k_partitions))
+ else:
+ for seqlen, idx in sorted_seqlen_list:
+ heapq.heappush(states_pq, State(items=[(idx, seqlen)], k=k_partitions))
+
+ while len(states_pq) > 1:
+ state0 = heapq.heappop(states_pq)
+ state1 = heapq.heappop(states_pq)
+ # merge states
+ state0.merge(state1)
+ heapq.heappush(states_pq, state0)
+
+ final_state = states_pq[0]
+ partitions = final_state.get_partitions()
+ if equal_size:
+ for i, partition in enumerate(partitions):
+ assert len(partition) * \
+ k_partitions == len(seqlen_list), f"{len(partition)} * {k_partitions} != {len(seqlen_list)}"
+ return partitions
+
+
+def greedy_partition(seqlen_list: List[int], k_partitions: int, equal_size: bool):
+ bias = sum(seqlen_list) + 1 if equal_size else 0
+ sorted_seqlen = [(seqlen + bias, i) for i, seqlen in enumerate(seqlen_list)]
+ partitions = [[] for _ in range(k_partitions)]
+ partition_sums = [0 for _ in range(k_partitions)]
+ for seqlen, i in sorted_seqlen:
+ min_idx = None
+ for j in range(k_partitions):
+ if min_idx is None or partition_sums[j] < partition_sums[min_idx]:
+ min_idx = j
+ partitions[min_idx].append(i)
+ partition_sums[min_idx] += seqlen
+ if equal_size:
+ for i, partition in enumerate(partitions):
+ assert len(partition) * \
+ k_partitions == len(seqlen_list), f"{len(partition)} * {k_partitions} != {len(seqlen_list)}"
+ return partitions
+
+
+def get_seqlen_balanced_partitions(seqlen_list: List[int], k_partitions: int, equal_size: bool):
+ """ get order of seq lengths to make partitions balanced, this is
+ used in balacing sum of seqlength across dp ranks and microbatches
+ Parameters:
+ seqlen_list (List[int]):
+ seq lengths of each items
+ k_partitions (int):
+ resulting number of partitions
+ equal_size (bool):
+ if True, number of items in each partitions must be equal.
+ if False, only consider balancing the sum, each partition can have
+ variable number of items
+ Returns:
+ partitions (List[List[int]]):
+ return k_partitions list containing the index of items.
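+ Example (illustrative):
+ get_seqlen_balanced_partitions([1, 9, 2, 8], k_partitions=2, equal_size=True)
+ may return [[2, 3], [0, 1]] (partition sums 10 and 10)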
+ """
+ assert len(seqlen_list) >= k_partitions, f"number of items:[{len(seqlen_list)}] < k_partitions:[{k_partitions}]"
+
+ def _check_and_sort_partitions(partitions):
+ assert len(partitions) == k_partitions, f"{len(partitions)} != {k_partitions}"
+ seen_idx = set()
+ sorted_partitions = [None] * k_partitions
+ for i, partition in enumerate(partitions):
+ assert len(partition) > 0, f"the {i}-th partition is empty"
+ for idx in partition:
+ seen_idx.add(idx)
+ sorted_partitions[i] = sorted(partition)
+ assert seen_idx == set(range(len(seqlen_list)))
+ return sorted_partitions
+
+ partitions = karmarkar_karp(seqlen_list=seqlen_list, k_partitions=k_partitions, equal_size=equal_size)
+ return _check_and_sort_partitions(partitions)
+
+
+def log_seqlen_unbalance(seqlen_list: List[int], partitions: List[List[int]], prefix):
+ # add some metrics of seqlen sum on dp ranks
+ k_partition = len(partitions)
+ # assert len(seqlen_list) % k_partition == 0
+ batch_size = len(seqlen_list) // k_partition
+ min_sum_seqlen = None
+ max_sum_seqlen = None
+ total_sum_seqlen = 0
+ for offset in range(0, len(seqlen_list), batch_size):
+ cur_sum_seqlen = sum(seqlen_list[offset:offset + batch_size])
+ if min_sum_seqlen is None or cur_sum_seqlen < min_sum_seqlen:
+ min_sum_seqlen = cur_sum_seqlen
+ if max_sum_seqlen is None or cur_sum_seqlen > max_sum_seqlen:
+ max_sum_seqlen = cur_sum_seqlen
+ total_sum_seqlen += cur_sum_seqlen
+
+ balanced_sum_seqlen_list = []
+ for partition in partitions:
+ cur_sum_seqlen_balanced = sum([seqlen_list[i] for i in partition])
+ balanced_sum_seqlen_list.append(cur_sum_seqlen_balanced)
+ # print("balanced_sum_seqlen_list: ", balanced_sum_seqlen_list)
+ min_sum_seqlen_balanced = min(balanced_sum_seqlen_list)
+ max_sum_seqlen_balanced = max(balanced_sum_seqlen_list)
+
+ return {
+ f'{prefix}/min': min_sum_seqlen,
+ f'{prefix}/max': max_sum_seqlen,
+ f'{prefix}/minmax_diff': max_sum_seqlen - min_sum_seqlen,
+ f'{prefix}/balanced_min': min_sum_seqlen_balanced,
+ f'{prefix}/balanced_max': max_sum_seqlen_balanced,
+ f'{prefix}/mean': total_sum_seqlen / len(partitions)
+ }
+
+
+def ceildiv(a, b):
+ return -(a // -b)
+
+
+def rearrange_micro_batches(batch: TensorDict, max_token_len, dp_group=None):
+ """Split the batch into a list of micro_batches, where the max_token_len is smaller than max_token_len
+ and the number of valid tokens in each micro batch is well balanced.
+ """
+ # this is per local micro_bsz
+ max_seq_len = batch['attention_mask'].shape[-1]
+ assert max_token_len >= max_seq_len, \
+ f'max_token_len must be at least the max sequence length. Got {max_token_len=} and {max_seq_len=}'
+
+ seq_len_effective: torch.Tensor = batch['attention_mask'].sum(dim=1)
+ total_seqlen = seq_len_effective.sum().item()
+ num_micro_batches = ceildiv(total_seqlen, max_token_len)
+ if dist.is_initialized():
+ num_micro_batches = torch.tensor([num_micro_batches], device='cuda')
+ dist.all_reduce(num_micro_batches, op=dist.ReduceOp.MAX, group=dp_group)
+ num_micro_batches = num_micro_batches.cpu().item()
+
+ seq_len_effective = seq_len_effective.tolist()
+ assert num_micro_batches <= len(seq_len_effective)
+
+ micro_bsz_idx = get_seqlen_balanced_partitions(seq_len_effective, num_micro_batches, equal_size=False)
+
+ micro_batches = []
+
+ for partition in micro_bsz_idx:
+ curr_micro_batch = []
+ for idx in partition:
+ curr_micro_batch.append(batch[idx:idx + 1])
+ curr_micro_batch = torch.cat(curr_micro_batch)
+
+ micro_batches.append(curr_micro_batch)
+
+ return micro_batches, micro_bsz_idx
+
+
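+# Minimal sketch (illustrative, not in the original file) of splitting a padded
+# batch by a token budget, assuming this module's top-level imports (torch,
+# TensorDict, dist) used above; shapes and the budget are made-up values.
+def _demo_rearrange_micro_batches():
+    mask = torch.tensor([[1, 1, 1, 1], [1, 1, 0, 0], [1, 0, 0, 0]])
+    batch = TensorDict({'attention_mask': mask}, batch_size=[3])
+    # 7 valid tokens with a budget of 4 -> ceildiv(7, 4) = 2 micro batches
+    micro_batches, idx = rearrange_micro_batches(batch, max_token_len=4)
+    return micro_batches, idx
+
+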
+def get_reverse_idx(idx_map):
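+    # e.g. idx_map [2, 0, 1] -> [1, 2, 0], so that reverse_idx_map[idx_map[i]] == i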
+ reverse_idx_map = copy.deepcopy(idx_map)
+
+ for i, idx in enumerate(idx_map):
+ reverse_idx_map[idx] = i
+
+ return reverse_idx_map
diff --git a/code/RL_model/verl/Search-R1/verl/utils/tokenizer.py b/code/RL_model/verl/Search-R1/verl/utils/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b64b6623ac62b6b3f4288dccf8f5307fc87439c7
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/tokenizer.py
@@ -0,0 +1,58 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utils for tokenization."""
+import warnings
+
+__all__ = ['hf_tokenizer']
+
+
+def set_pad_token_id(tokenizer):
+ """Set pad_token_id to eos_token_id if it is None.
+
+ Args:
+ tokenizer (transformers.PreTrainedTokenizer): The tokenizer to be set.
+
+ """
+ if tokenizer.pad_token_id is None:
+ tokenizer.pad_token_id = tokenizer.eos_token_id
+ warnings.warn(f'tokenizer.pad_token_id is None. Now set to {tokenizer.eos_token_id}')
+ if tokenizer.pad_token is None:
+ tokenizer.pad_token = tokenizer.eos_token
+ warnings.warn(f'tokenizer.pad_token is None. Now set to {tokenizer.eos_token}')
+
+
+def hf_tokenizer(name_or_path, correct_pad_token=True, correct_gemma2=True, **kwargs):
+ """Create a huggingface pretrained tokenizer.
+
+ Args:
+        name_or_path (str): The name or path of the tokenizer.
+ correct_pad_token (bool): Whether to correct the pad token id.
+ correct_gemma2 (bool): Whether to correct the gemma2 tokenizer.
+ **kwargs: The keyword arguments for the tokenizer.
+
+ Returns:
+ transformers.PreTrainedTokenizer: The pretrained tokenizer.
+
+ """
+ from transformers import AutoTokenizer
+ if correct_gemma2 and isinstance(name_or_path, str) and 'gemma-2-2b-it' in name_or_path:
+        # the EOS token in gemma2 is ambiguous, which may worsen RL performance.
+ # https://huggingface.co/google/gemma-2-2b-it/commit/17a01657f5c87135bcdd0ec7abb4b2dece04408a
+        warnings.warn('Found gemma-2-2b-it tokenizer. Set eos_token and eos_token_id to <end_of_turn> and 107.')
+        kwargs['eos_token'] = '<end_of_turn>'
+        kwargs['eos_token_id'] = 107
+ tokenizer = AutoTokenizer.from_pretrained(name_or_path, **kwargs)
+ if correct_pad_token:
+ set_pad_token_id(tokenizer)
+ return tokenizer
\ No newline at end of file
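+# Usage sketch (illustrative; the model name below is an assumption, not taken
+# from this repo):
+#   tokenizer = hf_tokenizer('gpt2')
+#   assert tokenizer.pad_token_id is not None  # set_pad_token_id filled it in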
diff --git a/code/RL_model/verl/Search-R1/verl/utils/torch_dtypes.py b/code/RL_model/verl/Search-R1/verl/utils/torch_dtypes.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb63df13b9c26802dff23c92ae8e36f5c23ae4fd
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/torch_dtypes.py
@@ -0,0 +1,82 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Adapted from Cruise.
+"""
+
+import torch
+
+from typing import Union
+
+HALF_LIST = [16, "16", "fp16", "float16"]
+FLOAT_LIST = [32, "32", "fp32", "float32"]
+BFLOAT_LIST = ["bf16", "bfloat16"]
+
+
+class PrecisionType(object):
+ """Type of precision used.
+
+    >>> PrecisionType.HALF == "16"
+    True
+ >>> PrecisionType.HALF in (16, "16")
+ True
+ """
+
+ HALF = "16"
+ FLOAT = "32"
+ FULL = "64"
+ BFLOAT = "bf16"
+ MIXED = "mixed"
+
+    @staticmethod
+    def supported_type(precision: Union[str, int]) -> bool:
+        # PrecisionType is a plain class rather than an Enum, so iterate over
+        # the explicit list of supported values instead of the class itself
+        return any(x == precision for x in PrecisionType.supported_types())
+
+    @staticmethod
+    def supported_types() -> list[str]:
+        return [PrecisionType.HALF, PrecisionType.FLOAT, PrecisionType.FULL, PrecisionType.BFLOAT, PrecisionType.MIXED]
+
+ @staticmethod
+ def is_fp16(precision):
+ return precision in HALF_LIST
+
+ @staticmethod
+ def is_fp32(precision):
+ return precision in FLOAT_LIST
+
+ @staticmethod
+ def is_bf16(precision):
+ return precision in BFLOAT_LIST
+
+ @staticmethod
+ def to_dtype(precision):
+ if precision in HALF_LIST:
+ return torch.float16
+ elif precision in FLOAT_LIST:
+ return torch.float32
+ elif precision in BFLOAT_LIST:
+ return torch.bfloat16
+ else:
+ raise RuntimeError(f"unexpected precision: {precision}")
+
+ @staticmethod
+ def to_str(precision):
+ if precision == torch.float16:
+ return 'fp16'
+ elif precision == torch.float32:
+ return 'fp32'
+ elif precision == torch.bfloat16:
+ return 'bf16'
+ else:
+ raise RuntimeError(f"unexpected precision: {precision}")
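+# Usage sketch (illustrative): round-tripping a precision spec.
+#   assert PrecisionType.to_dtype('bf16') is torch.bfloat16
+#   assert PrecisionType.to_str(torch.bfloat16) == 'bf16'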
diff --git a/code/RL_model/verl/Search-R1/verl/utils/torch_functional.py b/code/RL_model/verl/Search-R1/verl/utils/torch_functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d53ca7a4e40efc715ceba1f3a8c725c2fe256a0
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/torch_functional.py
@@ -0,0 +1,492 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Contain small torch utilities
+"""
+
+from typing import Dict, Union, List, Optional
+
+import os
+import torch
+import torch.distributed
+import torch.nn.functional as F
+from tensordict import TensorDict
+from torch import nn
+
+try:
+ from flash_attn.ops.triton.cross_entropy import cross_entropy_loss
+    FLASH_ATTN_CROSS_ENTROPY_LOSS_AVAILABLE = True
+except ImportError:
+    FLASH_ATTN_CROSS_ENTROPY_LOSS_AVAILABLE = False
+
+
+def gather_from_labels(data, label):
+ """Gather the label from data. The value in label should be [0, vocab_size)
+
+ Args:
+ data: (..., vocab_size)
+ label (torch.IntTensor) : (...,)
+
+    Returns:
+        output: (...,), the values of ``data`` gathered at the ``label`` indices
+    """
+
+ output = torch.gather(data, -1, label.unsqueeze(-1)).squeeze(-1)
+ return output
+
+
+def logprobs_from_logits(logits, labels):
+ """
+ See: https://github.com/pytorch/pytorch/issues/563#issuecomment-330103591
+ """
+    if FLASH_ATTN_CROSS_ENTROPY_LOSS_AVAILABLE:
+ batch_dim = logits.shape[:-1]
+ last_dim = logits.shape[-1]
+ logits = logits.reshape(-1, last_dim)
+ labels = labels.reshape(-1)
+ output = logprobs_from_logits_flash_attn(logits, labels)
+ output = output.view(*batch_dim)
+ else:
+ output = logprobs_from_logits_naive(logits, labels)
+ return output
+
+
+def logprobs_from_logits_flash_attn(logits, labels):
+ output = -cross_entropy_loss(logits, labels)[0]
+ return output
+
+
+def logprobs_from_logits_naive(logits, labels):
+ logp = F.log_softmax(logits, dim=-1)
+ logpy = gather_from_labels(logp, labels)
+ return logpy
+
+
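+# Shape sketch (illustrative, not in the original file): output[b, t] is the
+# log-probability of labels[b, t] under logits[b, t, :].
+def _demo_logprobs_from_logits():
+    logits = torch.randn(2, 4, 10)
+    labels = torch.randint(0, 10, (2, 4))
+    out = logprobs_from_logits_naive(logits, labels)
+    assert out.shape == (2, 4)
+    return out
+
+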
+def logprobs_of_labels_v2(logits: torch.FloatTensor, labels):
+ """
+ A memory efficient implementation of logprobs_from_logits
+ """
+ assert logits.dtype == torch.float32, 'Using bf16 logits with logprobs_of_labels_v2 may lead to divergence'
+ logprobs_labels = torch.gather(logits, dim=-1, index=labels.unsqueeze(-1))
+ logprobs_labels = logprobs_labels - torch.logsumexp(logits, dim=-1, keepdim=True)
+ return logprobs_labels.squeeze(-1)
+
+
+def clip_by_value(x, tensor_min, tensor_max):
+ """
+    Tensor extension to torch.clamp
+ https://github.com/pytorch/pytorch/issues/2793#issuecomment-428784713
+ """
+ clipped = torch.max(torch.min(x, tensor_max), tensor_min)
+ return clipped
+
+
+def entropy_from_logits(logits: torch.Tensor):
+ """Calculate entropy from logits."""
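+    # identity used: H(p) = -sum_i p_i * log p_i = logsumexp(logits) - sum_i p_i * logits_i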
+ pd = torch.nn.functional.softmax(logits, dim=-1)
+ entropy = torch.logsumexp(logits, dim=-1) - torch.sum(pd * logits, dim=-1)
+ return entropy
+
+
+def masked_sum(values, mask, axis=None):
+    """Compute the sum of tensor values at positions where the mask is set."""
+ return (values * mask).sum(axis=axis)
+
+
+def masked_mean(values, mask, axis=None):
+    """Compute the mean of tensor values at positions where the mask is set."""
+ return (values * mask).sum(axis=axis) / mask.sum(axis=axis)
+
+
+def masked_var(values, mask, unbiased=True):
+ """Compute variance of tensor with masked values."""
+ mean = masked_mean(values, mask)
+ centered_values = values - mean
+ variance = masked_mean(centered_values**2, mask)
+ if unbiased:
+ mask_sum = mask.sum()
+ if mask_sum == 0:
+ raise ValueError("At least one element in the mask has to be 1.")
+ # note that if mask_sum == 1, then there is a division by zero issue
+ # to avoid it you just need to use a larger minibatch_size
+ if mask_sum == 1:
+ raise ValueError("The sum of the mask is one, which can cause a division by zero.")
+ bessel_correction = mask_sum / (mask_sum - 1)
+ variance = variance * bessel_correction
+ return variance
+
+
+def masked_whiten(values, mask, shift_mean=True):
+ """Whiten values with masked values."""
+ mean, var = masked_mean(values, mask), masked_var(values, mask)
+ whitened = (values - mean) * torch.rsqrt(var + 1e-8)
+ if not shift_mean:
+ whitened += mean
+ return whitened
+
+
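+# Worked example (illustrative): masked_mean(torch.tensor([1., 2., 3., 4.]),
+# torch.tensor([1., 1., 0., 0.])) == 1.5 -- positions with mask == 0 are
+# excluded from all the masked statistics above.
+
+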
+def get_eos_mask(response_id: torch.Tensor, eos_token: int = 2, dtype=torch.int64):
+    '''
+    Mask out everything after the first eos token, e.g. with eos_token=1:
+    response_id: [0, 0, 2, 42, 3, 5, 1, 0, 0]
+    eos_mask:    [1, 1, 1, 1,  1, 1, 1, 0, 0]
+    (the first eos token itself is kept)
+    '''
+ eos_mask = response_id.eq(eos_token).long()
+ eos_mask = (torch.cumsum(eos_mask, dim=1) - eos_mask).bool()
+ eos_mask = torch.logical_not(eos_mask).to(dtype)
+ return eos_mask
+
+
+def compute_grad_norm(model: nn.Module):
+    # note: returns the sum of squared gradient entries, not its square root
+    total_grad_square = 0
+    for param in model.parameters():
+        if param.grad is not None:
+            total_grad_square += torch.sum(torch.square(param.grad.detach())).item()
+    return total_grad_square
+
+
+def broadcast_dict_tensor(tensors: Union[Dict[str, torch.Tensor], TensorDict], src, group):
+ """
+ TODO: optimize this. Technically, we only need one broadcast
+ """
+
+ for key in tensors.sorted_keys:
+ torch.distributed.broadcast(tensors[key], src=src, group=group, async_op=False)
+
+
+def allgather_dict_tensors(tensors: Union[Dict[str, torch.Tensor], TensorDict], size, group, dim=0):
+ """
+ TODO: optimize this.
+ - We can use async ops
+ - We can use only one allgather
+ Args:
+ tensors:
+ size:
+ group:
+
+ Returns:
+
+ """
+ if isinstance(tensors, TensorDict):
+ is_tensor_dict = True
+ tensors_as_dict = tensors.to_dict()
+ else:
+ tensors_as_dict = tensors
+ is_tensor_dict = False
+
+ output = {}
+ sorted_keys = sorted(tensors_as_dict.keys())
+ for key in sorted_keys:
+ val = tensors_as_dict[key]
+ output[key] = [torch.empty_like(val) for _ in range(size)]
+ torch.distributed.all_gather(output[key], val, group=group, async_op=False)
+ output[key] = torch.cat(output[key], dim=dim)
+
+ if is_tensor_dict:
+ output = TensorDict(source=output, batch_size=tensors.batch_size[0] * size)
+
+ return output
+
+
+def split_dict_tensor_into_batches(tensors: TensorDict, batch_size) -> List[TensorDict]:
+ assert tensors.batch_size[0] % batch_size == 0, \
+ f'input data batch size: {tensors.batch_size[0]}, split batch size: {batch_size}'
+ return tensors.split(batch_size)
+
+
+def pad_sequence_to_length(tensors, max_seq_len, pad_token_id, left_pad=False):
+    """
+    Pad a 2D tensor (e.g. responses, logprobs) in the last dim to max_seq_len.
+    input shape: [bs, seq_length]
+    output shape: [bs, max_seq_len]
+    A pad tuple of (0, max_seq_len - tensors.shape[-1]) right-pads with no left pad;
+    the reversed tuple left-pads.
+    """
+ if tensors.shape[-1] >= max_seq_len:
+ return tensors
+ pad_tuple = (max_seq_len - tensors.shape[-1], 0) if left_pad else (0, max_seq_len - tensors.shape[-1])
+ return F.pad(tensors, pad_tuple, 'constant', pad_token_id)
+
+
+from transformers import PreTrainedTokenizer
+
+
+def tokenize_and_postprocess_data(prompt: str,
+ tokenizer: PreTrainedTokenizer,
+ max_length: int,
+ pad_token_id: int,
+ left_pad=True,
+ truncation='error'):
+    """
+    Tokenize the prompt, then pad (or truncate, according to ``truncation``)
+    the result to max_length.
+    """
+ assert truncation in ['left', 'right', 'error']
+
+ input_data = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
+
+ input_ids = input_data['input_ids']
+ attention_mask = input_data['attention_mask']
+
+ assert input_ids.ndim == 2
+
+ sequence_length = input_ids.shape[-1]
+ if sequence_length < max_length:
+ input_ids = pad_sequence_to_length(input_ids,
+ max_seq_len=max_length,
+ pad_token_id=pad_token_id,
+ left_pad=left_pad)
+ attention_mask = pad_sequence_to_length(attention_mask,
+ max_seq_len=max_length,
+ pad_token_id=0,
+ left_pad=left_pad)
+ elif sequence_length > max_length:
+ if truncation == 'left':
+ # actually, left truncation may not be reasonable
+ input_ids = input_ids[:, -max_length:]
+ attention_mask = attention_mask[:, -max_length:]
+ elif truncation == 'right':
+ input_ids = input_ids[:, :max_length]
+ attention_mask = attention_mask[:, :max_length]
+ elif truncation == 'error':
+ raise NotImplementedError(f'{sequence_length=} is larger than {max_length=}')
+ else:
+ raise NotImplementedError(f'Unknown truncation method {truncation}')
+
+ return input_ids, attention_mask
+
+
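+# Usage sketch (illustrative; `tokenizer` is an assumption, e.g. any HF tokenizer):
+#   ids, mask = tokenize_and_postprocess_data('hello world', tokenizer,
+#                                             max_length=16,
+#                                             pad_token_id=tokenizer.pad_token_id,
+#                                             left_pad=True)
+#   # ids.shape == mask.shape == torch.Size([1, 16])
+
+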
+def remove_pad_token(input_ids: torch.Tensor, attention_mask: torch.Tensor):
+ """ Remove the pad token.
+
+ Args:
+ input_ids shape: [bs, seq_length]
+ attention_mask shape: [bs, seq_length]
+ Returns:
+        no_padding_batch (List[List[int]]): the token ids with padding removed, per query.
+ """
+ no_padding_batch = []
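+    # assumes left padding: the valid tokens are the trailing mask.sum() positions of each row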
+ for ids, mask in zip(input_ids, attention_mask):
+ no_padding_batch.append((ids[len(ids) - mask.sum():]).cpu().numpy().tolist())
+ return no_padding_batch
+
+
+def log_probs_from_logits_response(input_ids, logits, response_length):
+ """Compute the response log_probs from full logits. Note that logits = model(input_ids)
+
+ Args:
+ input_ids: [batch_size, seqlen]
+ logits: [batch_size, seqlen, vocab_size]
+
+ Returns:
+ response_log_prob:
+ """
+ response_logits = logits[:, -response_length - 1:-1]
+ response = input_ids[:, -response_length:]
+ response_log_prob = logprobs_from_logits(logits=response_logits, labels=response)
+ return response_log_prob
+
+
+def log_probs_from_logits_response_rmpad(input_ids, attention_mask, logits_rmpad, response_length):
+    """Compute the log_probs from logits with rmpad logits and padded input. Note that
+    logits_rmpad = model(input_ids_rmpad). For each sentence, there is a shift between
+    logits and input_ids.
+    The reason for this function is to compute logprobs_from_logits in rmpad mode, because
+    materializing the full padded logits is memory-intensive for large vocab_size
+
+ Args:
+ input_ids: [batch_size, seqlen]
+ attention_mask: [batch_size, seqlen]
+ logits_rmpad: [total_nnz, vocab_size]
+ response_length: int
+ """
+ from flash_attn.bert_padding import pad_input, unpad_input
+
+ batch_size, seqlen = input_ids.shape
+ input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1), attention_mask=attention_mask)
+ input_ids_rmpad = input_ids_rmpad.squeeze(-1)
+ input_ids_rmpad_rolled = torch.roll(input_ids_rmpad, shifts=-1, dims=0)
+ full_log_probs_rmpad = logprobs_from_logits(logits=logits_rmpad, labels=input_ids_rmpad_rolled) # (total_nnz,)
+ full_output = pad_input(hidden_states=full_log_probs_rmpad.unsqueeze(-1),
+ indices=indices,
+ batch=batch_size,
+ seqlen=seqlen)
+ output = full_output.squeeze(-1)[:, -response_length - 1:-1] # [batch_size, response_length]
+ return output
+
+
+def log_probs_from_logits_all_rmpad(input_ids_rmpad, logits_rmpad, indices, batch_size, seqlen, response_length):
+    """Compute the log_probs from logits with rmpad input_ids and logits. Note that
+    logits_rmpad = model(input_ids_rmpad). For each sentence, there is a shift between
+    logits and input_ids.
+    The reason for this function is to compute logprobs_from_logits in rmpad mode, because
+    materializing the full padded logits is memory-intensive for large vocab_size
+
+ Args:
+ input_ids_rmpad: [1, total_nnz]
+ logits_rmpad: [total_nnz, vocab_size]
+ indices: [total_nnz]
+ batch_size: int
+ seqlen: int
+ response_length: int
+ """
+ from flash_attn.bert_padding import pad_input
+ input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # transpose back to [total_nnz, 1]
+ input_ids_rmpad = input_ids_rmpad.squeeze(-1)
+ input_ids_rmpad_rolled = torch.roll(input_ids_rmpad, shifts=-1, dims=0)
+ full_log_probs_rmpad = logprobs_from_logits(logits=logits_rmpad, labels=input_ids_rmpad_rolled) # (total_nnz,)
+ full_output = pad_input(hidden_states=full_log_probs_rmpad.unsqueeze(-1),
+ indices=indices,
+ batch=batch_size,
+ seqlen=seqlen)
+ output = full_output.squeeze(-1)[:, -response_length - 1:-1] # [batch_size, response_length]
+ return output
+
+
+from transformers.generation.logits_process import (TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper)
+
+
+def post_process_logits(input_ids, logits, temperature, top_k, top_p):
+ if temperature != 1.:
+ logits = logits.div_(temperature) # inplace operation to avoid OOM
+ # TODO: add them back
+ # if top_k is not None and top_k > 0:
+ # logits = TopKLogitsWarper(top_k=top_k)(input_ids, logits)
+ # if top_p is not None and top_p < 1.0 and top_p > 0.0:
+ # logits = TopPLogitsWarper(top_p=top_p)(input_ids, logits)
+ return logits
+
+
+"""
+Optimizer related
+"""
+
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import LambdaLR
+import math
+
+
+def get_cosine_schedule_with_warmup(
+ optimizer: Optimizer,
+ num_warmup_steps: int,
+ num_training_steps: int,
+ min_lr_ratio: float = 0.0,
+ num_cycles: float = 0.5,
+ last_epoch: int = -1,
+):
+ """
+    Create a schedule with a learning rate that decreases following the values of the cosine function from the
+    initial lr set in the optimizer down to min_lr_ratio times that lr, after a warmup period during which it
+    increases linearly between 0 and the initial lr set in the optimizer.
+ Args:
+ optimizer (:class:`~torch.optim.Optimizer`):
+ The optimizer for which to schedule the learning rate.
+ num_warmup_steps (:obj:`int`):
+ The number of steps for the warmup phase.
+ num_training_steps (:obj:`int`):
+ The total number of training steps.
+ min_lr_ratio (:obj:`float`, `optional`, defaults to 0.0):
+ The minimum lr ratio w.r.t the maximum.
+ num_cycles (:obj:`float`, `optional`, defaults to 0.5):
+            The number of waves in the cosine schedule (the default is to just decrease from the max value to the
+            minimum following a half-cosine).
+ last_epoch (:obj:`int`, `optional`, defaults to -1):
+ The index of the last epoch when resuming training.
+ Return:
+ :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+ """
+ assert min_lr_ratio >= 0 and min_lr_ratio <= 1.
+ coef = (1 - min_lr_ratio) * 0.5
+ intercept = (1 + min_lr_ratio) * 0.5
+
+ def lr_lambda(current_step):
+ if current_step < num_warmup_steps:
+ return float(current_step) / float(max(1, num_warmup_steps))
+ progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
+ x = math.cos(math.pi * float(num_cycles) * 2.0 * progress)
+ return max(0.0, x * coef + intercept)
+
+ return LambdaLR(optimizer, lr_lambda, last_epoch)
+
+
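+# Usage sketch (illustrative; the optimizer and step counts below are assumptions):
+#   opt = torch.optim.AdamW(model.parameters(), lr=1e-5)
+#   sched = get_cosine_schedule_with_warmup(opt, num_warmup_steps=100,
+#                                           num_training_steps=1000,
+#                                           min_lr_ratio=0.1)
+# The lr rises linearly for 100 steps, then follows a half-cosine from the peak
+# down to 0.1x the peak at step 1000.
+
+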
+def get_constant_schedule_with_warmup(
+ optimizer: Optimizer,
+ num_warmup_steps: int,
+ last_epoch: int = -1,
+):
+
+ def lr_lambda(current_step):
+ return min(1, float(current_step) / float(max(1, num_warmup_steps)))
+
+ return LambdaLR(optimizer, lr_lambda, last_epoch)
+
+
+def prepare_decoder_attention_mask(attention_mask, input_shape, inputs_embeds):
+ # create causal mask
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ combined_attention_mask = None
+ if input_shape[-1] > 1:
+ combined_attention_mask = _make_causal_mask(
+ input_shape,
+ inputs_embeds.dtype,
+ device=inputs_embeds.device,
+ )
+
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype,
+ tgt_len=input_shape[-1]).to(inputs_embeds.device)
+ combined_attention_mask = (expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask +
+ combined_attention_mask)
+
+ return combined_attention_mask
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device):
+    """
+    Make causal mask used for uni-directional (causal) self-attention.
+    """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+ """
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+ """
+ bsz, src_len = mask.size()
+ tgt_len = tgt_len if tgt_len is not None else src_len
+
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+ inverted_mask = 1.0 - expanded_mask
+
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+def get_unpad_data(attention_mask):
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+ return (
+ indices,
+ cu_seqlens,
+ max_seqlen_in_batch,
+ )
diff --git a/code/RL_model/verl/Search-R1/verl/utils/tracking.py b/code/RL_model/verl/Search-R1/verl/utils/tracking.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1fbd6f330451b89286644e226fb743237bc436c
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/tracking.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+A unified tracking interface that supports logging data to different backends
+"""
+import dataclasses
+from enum import Enum
+from functools import partial
+from pathlib import Path
+from typing import List, Union, Dict, Any
+
+
+class Tracking(object):
+ supported_backend = ['wandb', 'mlflow', 'console']
+
+ def __init__(self, project_name, experiment_name, default_backend: Union[str, List[str]] = 'console', config=None):
+ if isinstance(default_backend, str):
+ default_backend = [default_backend]
+ for backend in default_backend:
+ if backend == 'tracking':
+ import warnings
+ warnings.warn("`tracking` logger is deprecated. use `wandb` instead.", DeprecationWarning)
+ else:
+ assert backend in self.supported_backend, f'{backend} is not supported'
+
+ self.logger = {}
+
+ if 'tracking' in default_backend or 'wandb' in default_backend:
+ import wandb
+ import os
+ WANDB_API_KEY = os.environ.get("WANDB_API_KEY", None)
+ if WANDB_API_KEY:
+ wandb.login(key=WANDB_API_KEY)
+ wandb.init(project=project_name, name=experiment_name, config=config)
+ self.logger['wandb'] = wandb
+
+ if 'mlflow' in default_backend:
+ import mlflow
+ mlflow.start_run(run_name=experiment_name)
+ mlflow.log_params(_compute_mlflow_params_from_objects(config))
+ self.logger['mlflow'] = _MlflowLoggingAdapter()
+
+ if 'console' in default_backend:
+ from verl.utils.logger.aggregate_logger import LocalLogger
+ self.console_logger = LocalLogger(print_to_console=True)
+ self.logger['console'] = self.console_logger
+
+ def log(self, data, step, backend=None):
+ for default_backend, logger_instance in self.logger.items():
+ if backend is None or default_backend in backend:
+ logger_instance.log(data=data, step=step)
+
+
+class _MlflowLoggingAdapter:
+
+ def log(self, data, step):
+ import mlflow
+ mlflow.log_metrics(metrics=data, step=step)
+
+
+def _compute_mlflow_params_from_objects(params) -> Dict[str, Any]:
+ if params is None:
+ return {}
+
+ return _flatten_dict(_transform_params_to_json_serializable(params, convert_list_to_dict=True), sep='/')
+
+
+def _transform_params_to_json_serializable(x, convert_list_to_dict: bool):
+ _transform = partial(_transform_params_to_json_serializable, convert_list_to_dict=convert_list_to_dict)
+
+ if dataclasses.is_dataclass(x):
+ return _transform(dataclasses.asdict(x))
+ if isinstance(x, dict):
+ return {k: _transform(v) for k, v in x.items()}
+ if isinstance(x, list):
+ if convert_list_to_dict:
+ return {'list_len': len(x)} | {f'{i}': _transform(v) for i, v in enumerate(x)}
+ else:
+ return [_transform(v) for v in x]
+ if isinstance(x, Path):
+ return str(x)
+ if isinstance(x, Enum):
+ return x.value
+
+ return x
+
+
+def _flatten_dict(raw: Dict[str, Any], *, sep: str) -> Dict[str, Any]:
+ import pandas as pd
+ ans = pd.json_normalize(raw, sep=sep).to_dict(orient='records')[0]
+ assert isinstance(ans, dict)
+ return ans
diff --git a/code/RL_model/verl/Search-R1/verl/utils/ulysses.py b/code/RL_model/verl/Search-R1/verl/utils/ulysses.py
new file mode 100644
index 0000000000000000000000000000000000000000..c085becc591d29a9517966cdee601843bdf24371
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/utils/ulysses.py
@@ -0,0 +1,288 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utilities for DeepSpeed Ulysses Sequence Parallelism.
+DeepSpeed Ulysses Paper: https://arxiv.org/abs/2309.14509
+Inspired from: https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/sequence/layer.py
+"""
+from typing import Any, Optional, List, Tuple
+
+import torch
+from torch import Tensor
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+_ULYSSES_SEQUENCE_PARALLEL_GROUP = None
+
+
+def set_ulysses_sequence_parallel_group(group: dist.ProcessGroup):
+ """
+ Set ulysses sequence parallel process group.
+ """
+ global _ULYSSES_SEQUENCE_PARALLEL_GROUP
+ _ULYSSES_SEQUENCE_PARALLEL_GROUP = group
+
+
+def get_ulysses_sequence_parallel_group() -> Optional[dist.ProcessGroup]:
+ """
+ Get ulysses sequence parallel process group.
+ """
+ global _ULYSSES_SEQUENCE_PARALLEL_GROUP
+ return _ULYSSES_SEQUENCE_PARALLEL_GROUP
+
+
+def get_ulysses_sequence_parallel_world_size(group: ProcessGroup = None) -> int:
+ """
+ Get ulysses sequence parallel world size.
+ """
+ group = get_ulysses_sequence_parallel_group() if group is None else group
+ return dist.get_world_size(group) if group else 1
+
+
+def get_ulysses_sequence_parallel_rank(group: ProcessGroup = None) -> int:
+ """
+ Get ulysses sequence parallel rank.
+ """
+ group = get_ulysses_sequence_parallel_group() if group is None else group
+ return dist.get_rank(group) if group else 0
+
+
+def gather_seq_scatter_heads(
+ x: Tensor,
+ seq_dim: int,
+ head_dim: int,
+ unpadded_dim_size: int = 0,
+ group: ProcessGroup = None,
+) -> Tensor:
+    """
+    A func to sync the embedding input with an all-to-all in sequence parallelism:
+    gather along the sequence dimension and scatter along the head dimension.
+    e.g. seq_dim: 1, head_dim: 2
+    [bsz, seq/n, h, ...] -> [bsz, seq, h/n, ...]
+    """
+ group = get_ulysses_sequence_parallel_group() if group is None else group
+ if not group:
+ return x
+ sp_world = get_ulysses_sequence_parallel_world_size(group)
+ x = SeqAllToAll.apply(group, x, head_dim, seq_dim)
+ if unpadded_dim_size and unpadded_dim_size % sp_world != 0:
+ padding_size = x.size(seq_dim) - unpadded_dim_size
+ x = _unpad_tensor(x, seq_dim, padding_size)
+ return x
+
+
+def gather_heads_scatter_seq(x: Tensor, head_dim: int, seq_dim: int, group: ProcessGroup = None) -> Tensor:
+    """
+    A func to sync the attention result with an all-to-all in sequence parallelism:
+    gather along the head dimension and scatter along the sequence dimension.
+    e.g. seq_dim: 1, head_dim: 2
+    [bsz, seq, h/n, ...] -> [bsz, seq/n, h, ...]
+    """
+ group = get_ulysses_sequence_parallel_group() if group is None else group
+ if not group:
+ return x
+ dim_size = x.size(seq_dim)
+ sp_world = get_ulysses_sequence_parallel_world_size(group)
+ if dim_size % sp_world != 0:
+ padding_size = sp_world - (dim_size % sp_world)
+ x = _pad_tensor(x, seq_dim, padding_size)
+ return SeqAllToAll.apply(group, x, seq_dim, head_dim, False)
+
+
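+# Shape walkthrough (illustrative, sp_world = 4):
+#   gather_seq_scatter_heads: [bsz, seq/4, h, d] -> [bsz, seq, h/4, d]
+#   gather_heads_scatter_seq: [bsz, seq, h/4, d] -> [bsz, seq/4, h, d]
+# (the seq dim is padded to a multiple of 4 first when needed)
+
+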
+def _pad_tensor(x: Tensor, dim: int, padding_size: int) -> Tensor:
+ shape = list(x.shape)
+ shape[dim] = padding_size
+ pad = torch.zeros(shape, dtype=x.dtype, device=x.device)
+ return torch.cat([x, pad], dim=dim)
+
+
+def _unpad_tensor(x: Tensor, dim: int, padding_size: int) -> Tensor:
+ slc = [slice(None)] * len(x.shape)
+ slc[dim] = slice(0, -padding_size)
+ return x[slc]
+
+
+def slice_input_tensor(x: Tensor, dim: int, padding: bool = True, group: ProcessGroup = None) -> Tensor:
+ group = get_ulysses_sequence_parallel_group() if group is None else group
+ sp_world_size = dist.get_world_size(group)
+ sp_rank = get_ulysses_sequence_parallel_rank()
+ dim_size = x.size(dim)
+ # pad before slice
+ if padding and dim_size % sp_world_size:
+ padding_size = sp_world_size - (dim_size % sp_world_size)
+ x = _pad_tensor(x, dim, padding_size)
+ # slice the input tensor
+ parts = x.size(dim) // sp_world_size
+ slc = [slice(None)] * len(x.shape)
+ slc[dim] = slice(sp_rank * parts, (sp_rank + 1) * parts)
+ return x[slc].contiguous()
+
+
+def all_to_all_tensor(
+ local_input: Tensor,
+ scatter_dim: int,
+ gather_dim: int,
+ group: Optional[dist.ProcessGroup] = None,
+ async_op: bool = False,
+):
+ group = get_ulysses_sequence_parallel_group() if group is None else group
+ seq_world_size = dist.get_world_size(group)
+ input_list = [t.contiguous() for t in torch.tensor_split(local_input, seq_world_size, scatter_dim)]
+ output_list = [torch.empty_like(input_list[0]) for _ in range(seq_world_size)]
+ comm = dist.all_to_all(output_list, input_list, group=group, async_op=async_op)
+ if async_op:
+
+ def wait():
+ comm.wait()
+ return torch.cat(output_list, dim=gather_dim).contiguous()
+
+ return wait
+ return torch.cat(output_list, dim=gather_dim).contiguous()
+
+
+def all_gather_tensor(local_tensor: Tensor, group: Optional[dist.ProcessGroup] = None, async_op: bool = False):
+ group = get_ulysses_sequence_parallel_group() if group is None else group
+ sp_world_size = dist.get_world_size(group=group)
+ output_shape = list(local_tensor.shape)
+ output_shape[0] = output_shape[0] * sp_world_size
+ output = torch.empty(output_shape, dtype=local_tensor.dtype, device=local_tensor.device)
+ dist.all_gather_into_tensor(output, local_tensor, group=group, async_op=async_op)
+ return output
+
+
+class SeqAllToAll(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ group: dist.ProcessGroup,
+ local_input: Tensor,
+ scatter_dim: int,
+ gather_dim: int,
+ async_op: bool = False,
+ ) -> Tensor:
+ ctx.group = group
+ ctx.scatter_dim = scatter_dim
+ ctx.gather_dim = gather_dim
+ ctx.async_op = async_op
+ return all_to_all_tensor(local_input, scatter_dim, gather_dim, group, async_op)
+
+ @staticmethod
+    def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor, None, None, None]:
+        if ctx.async_op:
+            input_t = torch.cat(grad_output[1:], dim=ctx.gather_dim).contiguous()
+        else:
+            input_t = grad_output[0]
+        # one gradient per forward input: (group, local_input, scatter_dim, gather_dim, async_op)
+        return (
+            None,
+            all_to_all_tensor(input_t, ctx.gather_dim, ctx.scatter_dim, ctx.group, False),
+            None,
+            None,
+            None,
+        )
+
+
+class Gather(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any,
+ group: dist.ProcessGroup,
+ local_tensor: Tensor,
+ gather_dim: int,
+ grad_scaler: bool = True,
+ async_op=False) -> Tensor:
+ ctx.group = group
+ ctx.gather_dim = gather_dim
+ ctx.grad_scaler = grad_scaler
+ ctx.async_op = async_op
+
+ sp_world_size = dist.get_world_size(group=group)
+ ctx.sp_world_size = sp_world_size
+
+ sp_rank = dist.get_rank(group=group)
+ ctx.sp_rank = sp_rank
+
+ local_shape = list(local_tensor.size())
+ split_size = local_shape[0]
+ part_size = local_shape[gather_dim] # store original size
+ ctx.part_size = part_size
+
+ output = all_gather_tensor(local_tensor, group, async_op)
+ return torch.cat(output.split(split_size, dim=0), dim=gather_dim)
+
+ @staticmethod
+    def backward(ctx: Any, grad_output: Tensor) -> Any:
+        if ctx.grad_scaler:
+            grad_output = grad_output * ctx.sp_world_size
+        # one gradient per forward input: (group, local_tensor, gather_dim, grad_scaler, async_op)
+        return (None, grad_output.split(ctx.part_size,
+                                        dim=ctx.gather_dim)[ctx.sp_rank].contiguous(), None, None, None)
+
+
+def gather_outpus_and_unpad(x: Tensor,
+ gather_dim: int,
+ unpad_dim: int = None,
+ padding_size: int = 0,
+ grad_scaler: bool = True,
+ group: Optional[dist.ProcessGroup] = None):
+    group = get_ulysses_sequence_parallel_group() if group is None else group
+    if group is None:
+        return x
+ x = Gather.apply(group, x, gather_dim, grad_scaler)
+ if unpad_dim is not None:
+ assert isinstance(padding_size, int), 'padding size is not given or is not an integer'
+ if padding_size == 0:
+ return x
+ x = _unpad_tensor(x, unpad_dim, padding_size)
+ return x
+
+
+def ulysses_pad_and_slice_inputs(input_ids_rmpad: torch.Tensor,
+ position_ids_rmpad: Optional[torch.Tensor] = None,
+ sp_size: int = 1):
+ """
+ Pad and slice input_ids to be divisible by sp_size
+ Pad position_ids to be divisible by sp_size.
+
+ Note both input_ids_rmpad and position_ids_rmpad will be padded,
+ but only input_ids will be sliced.
+
+    This is the pre-forward utility for ulysses sequence parallelism.
+
+ Args:
+ input_ids_rmpad: shape of [bsz, seqlen]
+ position_ids_rmpad: shape of [bsz, seqlen], where bsz must be 1
+ sp_size (int): ulysses sequence parallelism size
+
+ Returns:
+ torch.Tensor: padded and sliced input_ids
+        torch.Tensor: padded (but not sliced) position_ids
+ int: pad size
+ """
+ if position_ids_rmpad is not None:
+ assert position_ids_rmpad.size(0) == 1
+ assert input_ids_rmpad.size(1) == position_ids_rmpad.size(1)
+ if sp_size <= 1:
+ return input_ids_rmpad, position_ids_rmpad, 0
+ _, total_seq_len = input_ids_rmpad.shape
+ pad_size = (sp_size - total_seq_len % sp_size) % sp_size
+ if pad_size > 0:
+ input_ids_rmpad = torch.nn.functional.pad(input_ids_rmpad, (0, pad_size), value=0)
+ if position_ids_rmpad is not None:
+ pad_pos_ids = torch.arange(pad_size, device=position_ids_rmpad.device).unsqueeze(0)
+ position_ids_rmpad = torch.cat((position_ids_rmpad, pad_pos_ids), dim=-1)
+ # we don't need to slice position ids
+ input_ids_rmpad = slice_input_tensor(input_ids_rmpad, dim=1, padding=False)
+ return input_ids_rmpad, position_ids_rmpad, pad_size
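+# e.g. total_seq_len = 10 and sp_size = 4 -> pad_size = 2; after padding to 12
+# tokens, each of the 4 sp ranks holds a contiguous slice of 3 tokens.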
diff --git a/code/RL_model/verl/Search-R1/verl/version/version b/code/RL_model/verl/Search-R1/verl/version/version
new file mode 100644
index 0000000000000000000000000000000000000000..ceab6e11ece0bcec917c12e11d350946f085d549
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/version/version
@@ -0,0 +1 @@
+0.1
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/workers/__init__.py b/code/RL_model/verl/Search-R1/verl/workers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce90c5eb352d85c59105c0dc85b5f1dd576f095
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/code/RL_model/verl/Search-R1/verl/workers/actor/__init__.py b/code/RL_model/verl/Search-R1/verl/workers/actor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a1404e17695436516c55794f9094c094dba61ce
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/actor/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base import BasePPOActor
+from .dp_actor import DataParallelPPOActor
+
+__all__ = ["BasePPOActor", "DataParallelPPOActor"]
diff --git a/code/RL_model/verl/Search-R1/verl/workers/actor/base.py b/code/RL_model/verl/Search-R1/verl/workers/actor/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..144f0b90ef1efa77e5f1d4d26a07291ea89990cf
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/actor/base.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The base class for Actor
+"""
+from abc import ABC, abstractmethod
+from typing import Iterable, Dict
+
+from verl import DataProto
+import torch
+
+__all__ = ['BasePPOActor']
+
+
+class BasePPOActor(ABC):
+
+ def __init__(self, config):
+ """The base class for PPO actor
+
+ Args:
+ config (DictConfig): a config passed to the PPOActor. We expect the type to be
+ DictConfig (https://omegaconf.readthedocs.io/), but it can be any namedtuple in general.
+ """
+ super().__init__()
+ self.config = config
+
+ @abstractmethod
+ def compute_log_prob(self, data: DataProto) -> torch.Tensor:
+        """Compute log probabilities of the responses given a batch of data.
+
+ Args:
+ data (DataProto): a batch of data represented by DataProto. It must contain key ```input_ids```,
+ ```attention_mask``` and ```position_ids```.
+
+ Returns:
+            torch.Tensor: the log probabilities of the responses
+
+
+ """
+ pass
+
+ @abstractmethod
+ def update_policy(self, data: DataProto) -> Dict:
+ """Update the policy with an iterator of DataProto
+
+ Args:
+            data (DataProto): an iterator over the DataProto returned by
+                ```make_minibatch_iterator```
+
+        Returns:
+            Dict: a dictionary that may contain anything. Typically, it contains the statistics gathered while
+                updating the model, such as ```loss```, ```grad_norm```, etc.
+
+ """
+ pass
diff --git a/code/RL_model/verl/Search-R1/verl/workers/actor/dp_actor.py b/code/RL_model/verl/Search-R1/verl/workers/actor/dp_actor.py
new file mode 100644
index 0000000000000000000000000000000000000000..4717efc03afabaf4a9b1168ebdd0a8d465644b32
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/actor/dp_actor.py
@@ -0,0 +1,290 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Single Process Actor
+"""
+
+import itertools
+from typing import Iterable, Tuple
+
+import torch
+from torch import nn
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+
+from verl import DataProto
+from verl.trainer.ppo import core_algos
+from verl.workers.actor import BasePPOActor
+from verl.utils.py_functional import append_to_dict
+from verl.utils.torch_functional import logprobs_from_logits, masked_mean
+from verl.utils.ulysses import ulysses_pad_and_slice_inputs, gather_outpus_and_unpad
+from verl.utils.seqlen_balancing import rearrange_micro_batches, get_reverse_idx
+import verl.utils.torch_functional as verl_F
+
+from flash_attn.bert_padding import pad_input, unpad_input, rearrange, index_first_axis
+
+__all__ = ['DataParallelPPOActor']
+
+
+class DataParallelPPOActor(BasePPOActor):
+
+ def __init__(
+ self,
+ config,
+ actor_module: nn.Module,
+ actor_optimizer: torch.optim.Optimizer = None,
+ ):
+ """When optimizer is None, it is Reference Policy"""
+ super().__init__(config)
+ self.actor_module = actor_module
+ self.actor_optimizer = actor_optimizer
+ self.use_remove_padding = self.config.get('use_remove_padding', False)
+ print(f'Actor use_remove_padding={self.use_remove_padding}')
+ self.ulysses_sequence_parallel_size = self.config.ulysses_sequence_parallel_size
+ self.use_ulysses_sp = self.ulysses_sequence_parallel_size > 1
+
+ self.compute_entropy_from_logits = torch.compile(verl_F.entropy_from_logits, dynamic=True)
+
+ def _forward_micro_batch(self, micro_batch, temperature) -> Tuple[torch.Tensor, torch.Tensor]:
+ """
+ Returns:
+ entropy: # (bs, response_len)
+ log_probs: # (bs, response_len)
+ """
+ response_length = micro_batch['responses'].size(-1)
+ with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
+ input_ids = micro_batch['input_ids']
+ batch_size, seqlen = input_ids.shape
+ attention_mask = micro_batch['attention_mask']
+ position_ids = micro_batch['position_ids']
+
+ if self.use_remove_padding:
+ input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1),
+ attention_mask) # input_ids_rmpad (total_nnz, ...)
+ input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz)
+
+ # unpad the position_ids to align the rotary
+ position_ids_rmpad = index_first_axis(rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."),
+ indices).transpose(0, 1)
+
+ # for compute the log_prob
+ input_ids_rmpad_rolled = torch.roll(input_ids_rmpad, shifts=-1, dims=1) # (1, total_nnz)
+
+ # pad and slice the inputs if sp > 1
+ if self.use_ulysses_sp:
+ input_ids_rmpad, position_ids_rmpad, pad_size = ulysses_pad_and_slice_inputs(input_ids_rmpad, \
+ position_ids_rmpad, \
+ sp_size=self.ulysses_sequence_parallel_size)
+ input_ids_rmpad_rolled, _, _ = ulysses_pad_and_slice_inputs(input_ids_rmpad_rolled, None,
+ self.ulysses_sequence_parallel_size)
+
+ input_ids_rmpad_rolled = input_ids_rmpad_rolled.squeeze(0) # ((total_nnz / sp) + pad)
+
+ # only pass input_ids and position_ids to enable flash_attn_varlen
+ output = self.actor_module(input_ids=input_ids_rmpad,
+ attention_mask=None,
+ position_ids=position_ids_rmpad,
+                                           use_cache=False)  # prevent the model from thinking we are generating
+ logits_rmpad = output.logits.squeeze(0) # (total_nnz, vocab_size)
+
+ logits_rmpad.div_(temperature)
+
+ # compute entropy
+ entropy_rmpad = self.compute_entropy_from_logits(logits_rmpad) # ((total_nnz / sp) + pad)
+
+ # if use_sp: ((total_nnz / sp) + pad) ; if not use_sp: (batch, seqlen)
+ log_probs = logprobs_from_logits(logits=logits_rmpad, labels=input_ids_rmpad_rolled)
+
+ # gather log_prob if sp > 1
+ if self.use_ulysses_sp:
+ # gather and unpad for the ulysses sp
+ log_probs = gather_outpus_and_unpad(log_probs, gather_dim=0, unpad_dim=0, padding_size=pad_size)
+ entropy_rmpad = gather_outpus_and_unpad(entropy_rmpad,
+ gather_dim=0,
+ unpad_dim=0,
+ padding_size=pad_size)
+ # pad back to (bsz, seqlen)
+ full_entropy = pad_input(hidden_states=entropy_rmpad.unsqueeze(-1),
+ indices=indices,
+ batch=batch_size,
+ seqlen=seqlen)
+ full_log_probs = pad_input(hidden_states=log_probs.unsqueeze(-1),
+ indices=indices,
+ batch=batch_size,
+ seqlen=seqlen)
+
+ # only return response part:
+ entropy = full_entropy.squeeze(-1)[:, -response_length - 1:-1] # (bsz, response_length)
+ log_probs = full_log_probs.squeeze(-1)[:, -response_length - 1:-1] # (bsz, response_length)
+
+ else: # not using rmpad and no ulysses sp
+ output = self.actor_module(input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+                                        use_cache=False)  # prevent the model from thinking we are generating
+ logits = output.logits
+ logits.div_(temperature)
+ logits = logits[:, -response_length - 1:-1] # (bsz, response_length)
+ log_probs = logprobs_from_logits(logits, micro_batch['responses'])
+ entropy = verl_F.entropy_from_logits(logits) # (bsz, response_length)
+
+ return entropy, log_probs
+
+ def _optimizer_step(self):
+ assert self.config.grad_clip is not None
+
+ if isinstance(self.actor_module, FSDP):
+ grad_norm = self.actor_module.clip_grad_norm_(max_norm=self.config.grad_clip)
+ else:
+ grad_norm = torch.nn.utils.clip_grad_norm_(self.actor_module.parameters(), max_norm=self.config.grad_clip)
+ self.actor_optimizer.step()
+ return grad_norm
+
+ def compute_log_prob(self, data: DataProto) -> torch.Tensor:
+ """Compute the log probability of the responses given input_ids, attention_mask and position_ids
+
+ Args:
+ data (DataProto): a DataProto containing keys
+
+ ``input_ids``: tensor of shape [batch_size, sequence_length]. torch.int64. Note that input_ids is the
+ concatenation of prompt and response. Note that ``sequence_length = prompt_length + response_length``.
+
+ ``attention_mask``: tensor of shape [batch_size, sequence_length]. torch.int64.
+
+ ``position_ids``: tensor of shape [batch_size, sequence_length]. torch.int64.
+
+ ``responses``: tensor of shape [batch_size, response_length]. torch.int64.
+
+ Returns:
+ torch.Tensor: the log_prob tensor
+ """
+ # set to eval
+ self.actor_module.eval()
+
+ micro_batch_size = data.meta_info['micro_batch_size']
+        temperature = data.meta_info['temperature']  # temperature must be in the data.meta_info to avoid silent error
+ use_dynamic_bsz = data.meta_info['use_dynamic_bsz']
+
+ select_keys = ['responses', 'input_ids', 'attention_mask', 'position_ids']
+ batch = data.select(batch_keys=select_keys).batch
+
+ if use_dynamic_bsz:
+ # split using dynamic bsz
+ max_token_len = data.meta_info['max_token_len'] * self.ulysses_sequence_parallel_size
+ micro_batches, indices = rearrange_micro_batches(batch=batch, max_token_len=max_token_len)
+ else:
+ micro_batches = batch.split(micro_batch_size)
+
+ log_probs_lst = []
+ for micro_batch in micro_batches:
+ with torch.no_grad():
+ _, log_probs = self._forward_micro_batch(micro_batch, temperature=temperature)
+ log_probs_lst.append(log_probs)
+ log_probs = torch.concat(log_probs_lst, dim=0)
+
+ if use_dynamic_bsz:
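+            # restore the original sample order that dynamic micro-batching permuted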
+ indices = list(itertools.chain.from_iterable(indices))
+ assert len(indices) == log_probs.size(0), f"{len(indices)} vs. {log_probs.size()}"
+ revert_indices = torch.tensor(get_reverse_idx(indices), dtype=torch.long)
+ log_probs = log_probs[revert_indices]
+
+ return log_probs
+
+ def update_policy(self, data: DataProto):
+ # make sure we are in training mode
+ self.actor_module.train()
+
+ assert self.config.ppo_mini_batch_size % self.config.ppo_micro_batch_size == 0
+ self.gradient_accumulation = self.config.ppo_mini_batch_size // self.config.ppo_micro_batch_size
+        temperature = data.meta_info['temperature']  # temperature must be in the data.meta_info to avoid silent error
+
+ select_keys = ['responses', 'input_ids', 'attention_mask', 'position_ids', 'old_log_probs', 'advantages']
+ if self.config.state_masking:
+ select_keys.append('loss_mask')
+ if self.config.use_kl_loss:
+ select_keys.append('ref_log_prob')
+ batch = data.select(batch_keys=select_keys).batch
+
+ # Split to make minibatch iterator for updating the actor
+ # See PPO paper for details. https://arxiv.org/abs/1707.06347
+ dataloader = batch.split(self.config.ppo_mini_batch_size)
+
+ metrics = {}
+ for batch_idx, data in enumerate(dataloader):
+ # split batch into micro_batches
+ mini_batch = data
+ if self.config.use_dynamic_bsz:
+ max_token_len = self.config.ppo_max_token_len_per_gpu * self.ulysses_sequence_parallel_size
+ micro_batches, _ = rearrange_micro_batches(batch=mini_batch, max_token_len=max_token_len)
+ else:
+ # split batch into micro_batches
+ micro_batches = mini_batch.split(self.config.ppo_micro_batch_size)
+
+ self.actor_optimizer.zero_grad()
+
+ for data in micro_batches:
+ data = data.cuda() # actor device is cpu when using offload
+ responses = data['responses']
+ response_length = responses.size(1)
+ attention_mask = data['attention_mask']
+ response_mask = attention_mask[:, -response_length:]
+ if self.config.state_masking:
+ response_mask = data['loss_mask']
+ old_log_prob = data['old_log_probs']
+ advantages = data['advantages']
+
+ clip_ratio = self.config.clip_ratio
+ entropy_coeff = self.config.entropy_coeff
+
+ # all return: (bsz, response_length)
+ entropy, log_prob = self._forward_micro_batch(micro_batch=data, temperature=temperature)
+
+ pg_loss, pg_clipfrac, ppo_kl = core_algos.compute_policy_loss(old_log_prob=old_log_prob,
+ log_prob=log_prob,
+ advantages=advantages,
+ eos_mask=response_mask,
+ cliprange=clip_ratio)
+ # compute entropy loss from entropy
+ entropy_loss = verl_F.masked_mean(entropy, response_mask)
+
+ # compute policy loss
+ policy_loss = pg_loss - entropy_loss * entropy_coeff
+
+ if self.config.use_kl_loss:
+ ref_log_prob = data['ref_log_prob']
+ # compute kl loss
+ kld = core_algos.kl_penalty(logprob=log_prob,
+ ref_logprob=ref_log_prob,
+ kl_penalty=self.config.kl_loss_type)
+ kl_loss = masked_mean(kld, response_mask)
+
+ policy_loss = policy_loss + kl_loss * self.config.kl_loss_coef
+ metrics['actor/kl_loss'] = kl_loss.detach().item()
+ metrics['actor/kl_coef'] = self.config.kl_loss_coef
+
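+                # scale the loss so gradients accumulated over micro batches average to the mini-batch gradient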
+ loss = policy_loss / self.gradient_accumulation
+ loss.backward()
+
+ data = {
+ 'actor/entropy_loss': entropy_loss.detach().item(),
+ 'actor/pg_loss': pg_loss.detach().item(),
+ 'actor/pg_clipfrac': pg_clipfrac.detach().item(),
+ 'actor/ppo_kl': ppo_kl.detach().item(),
+ }
+ append_to_dict(metrics, data)
+
+ grad_norm = self._optimizer_step()
+ data = {'actor/grad_norm': grad_norm.detach().item()}
+ append_to_dict(metrics, data)
+ self.actor_optimizer.zero_grad()
+ return metrics
diff --git a/code/RL_model/verl/Search-R1/verl/workers/actor/megatron_actor.py b/code/RL_model/verl/Search-R1/verl/workers/actor/megatron_actor.py
new file mode 100644
index 0000000000000000000000000000000000000000..e674a28f6bbafabbfdb7b3c84e6d92833d1d8166
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/actor/megatron_actor.py
@@ -0,0 +1,368 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Megatron Actor.
+In the megatron actor, the main difference is:
+1. We only build minibatches here; micro-batch scheduling is handled by Megatron's pipeline runner
+
+Note that our model doesn't have to be `MegatronModule` because we don't share embedding in the last layer
+"""
+
+from functools import partial
+from typing import Iterable, Dict
+
+import torch
+from torch import nn
+import torch.distributed
+# from megatron import get_args
+from megatron.optimizer import DistributedOptimizer
+from verl.utils.megatron.optimizer_config import OptimizerConfig
+from megatron.core import parallel_state as mpu
+from megatron.core import ModelParallelConfig
+from megatron.core.pipeline_parallel import get_forward_backward_func
+# from megatron.core.optimizer import DistributedOptimizer
+
+from omegaconf import OmegaConf
+from verl.utils.megatron.tensor_parallel import vocab_parallel_compute_entropy_loss, vocab_parallel_log_probs_from_logits
+from verl.utils.megatron.pipeline_parallel import (compute_transformers_input_shapes, make_batch_generator)
+from verl import DataProto
+from verl.trainer.ppo import core_algos
+from verl.workers.actor import BasePPOActor
+from verl.utils.py_functional import append_to_dict
+from verl.utils.torch_functional import logprobs_from_logits, broadcast_dict_tensor, split_dict_tensor_into_batches
+
+__all__ = ['MegatronPPOActor']
+
+
+class MegatronPPOActor(BasePPOActor):
+
+ def __init__(self, config, model_config, megatron_config: ModelParallelConfig, actor_module: nn.ModuleList,
+ actor_optimizer: DistributedOptimizer, actor_optimizer_config: OptimizerConfig):
+        """MegatronPPOActor class. This class implements the simple PPO logics when the model is built with Megatron.
+
+ Args:
+ config (OmegaConf): the basic config that contains the hyper-parameters of PPO Actor. It must contain
+
+            ``ppo_micro_batch_size``: micro batch size when updating ppo.
+
+ ``ppo_mini_batch_size``: minibatch size when updating ppo using the batch data.
+
+ ``ppo_epochs``: number of epochs to update the actor using the batch data.
+
+ ``shuffle``: whether to shuffle the data after each ppo epoch.
+
+ ``clip_ratio``: clip ratio of the ppo algorithm. See https://arxiv.org/abs/1707.06347.
+
+ ``entropy_coeff``: entropy coefficient of the PPO loss. See https://arxiv.org/abs/1707.06347.
+            model_config (OmegaConf): model configuration. It must contain ``model_config.vocab_size`` and
+                ``model_config.hidden_size``
+            megatron_config (OmegaConf): megatron configuration. It must contain
+
+ ``sequence_parallel_enabled``: whether the sequence parallel is enabled.
+
+ ``param_dtype``: the dtype of the parameters.
+
+ ``virtual_pipeline_model_parallel_size``: virtual pipeline model parallel size. a.k.a number of chunks in each pp stage.
+ actor_module (nn.ModuleList): actor module is a ModuleList that contains a list of nn.Module in this pp stage.
+ each nn.Module in this rank holds a vpp module chunk. See https://arxiv.org/pdf/2104.04473.pdf for more details.
+ The actor module has some constraints to follow in order to use the updating logics implemented here
+
+                1. It must apply unpad_input before any computation and pad_input after all the computation. Removing padding is an
+                optimization that drops the padding tokens. See the unpad_input and pad_input functions in flash-attn
+ (https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/bert_padding.py).
+
+ 2. Each pp stage must return the hidden state with the same shape [total_nnz, 1, hidden_size],
+ where total_nnz is the number of valid tokens in this batch. If sequence parallel is enabled, the size
+ of the hidden state is [total_nnz // tp, 1, hidden_size].
+            actor_optimizer (DistributedOptimizer): currently, we only support DistributedOptimizer in Megatron. It implements
+                a ZeRO-1 style optimizer that shards the optimizer states across dp ranks.
+
+ >>> def megatron_actor_model_provider(pre_process, post_process):
+ >>> vpp_rank = mpu.get_virtual_pipeline_model_parallel_rank()
+ >>> parallel_model = ParallelMistralForCausalLMRmPadPP(config=actor_model_config,
+ >>> megatron_config=megatron_config,
+ >>> pre_process=pre_process,
+ >>> post_process=post_process).cuda()
+ >>> return parallel_model
+ >>> from megatron.training import get_model
+ >>> from megatron.optimizer import get_megatron_optimizer
+ >>> actor_module = get_model(megatron_actor_model_provider, wrap_with_ddp=True)
+ >>> actor_module = nn.ModuleList(actor_module)
+ >>> actor_optimizer = get_megatron_optimizer(actor_module)
+ >>> actor = MegatronPPOActor(config=config,
+ >>> model_config=actor_model_config,
+ >>> megatron_config=megatron_config,
+ >>> actor_module=actor_module,
+ >>> actor_optimizer=actor_optimizer)
+ """
+ super().__init__(config)
+ self.model_config = model_config
+ self.megatron_config = megatron_config
+ # self.megatron_args = get_args()
+ self.actor_module = actor_module
+ self.actor_optimizer: DistributedOptimizer = actor_optimizer
+ self.actor_optimizer_config = actor_optimizer_config
+
+ self.optimizer_step_args = OmegaConf.create({
+ 'skip_grad': None,
+ 'overlap_dp_param_comm': False,
+ 'overlap_dp_grad_comm': False,
+ 'gradient_accumulation_steps': 1,
+ 'sequence_parallel': self.megatron_config.sequence_parallel,
+ 'DDP_impl': 'local',
+ 'layernorm_allreduce_bucket_threshold': 0,
+ 'pipeline_model_parallel_split_rank': None,
+ 'reduce_grads_use_alltoall': False
+ })
+
+ def compute_log_prob(self, data: DataProto) -> torch.Tensor:
+ """Compute the log probability of the responses given input_ids, attention_mask and position_ids
+
+ Args:
+ data (DataProto): a DataProto containing keys
+
+ ``input_ids``: tensor of shape [batch_size, sequence_length]. torch.int64. Note that input_ids is the
+ concatenation of prompt and response. Note that ``sequence_length = prompt_length + response_length``.
+
+ ``attention_mask``: tensor of shape [batch_size, sequence_length]. torch.int64.
+
+ ``position_ids``: tensor of shape [batch_size, sequence_length]. torch.int64.
+
+ ``responses``: tensor of shape [batch_size, response_length]. torch.int64.
+
+ Returns:
+            torch.Tensor: the log_prob tensor of shape [batch_size, response_length]
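+
+        Example (illustrative sketch; assumes ``actor`` and ``data`` are built as in the class docstring):
+
+        >>> log_probs = actor.compute_log_prob(data)
+        >>> # log_probs: float32 tensor of shape [batch_size, response_length], identical on every pp rank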
+ """
+ data.batch = data.batch.contiguous()
+
+ def compute_logprobs_fn(output, data):
+ response = data['responses']
+ response_length = response.size(1)
+ logits = output['logits']
+ logits = logits[:, -response_length - 1:-1]
+ log_probs = vocab_parallel_log_probs_from_logits(logits, response)
+ return {'log_probs': log_probs}
+
+        # We recompute old_log_prob by default here.
+        # TODO (zhangchi.usc1992): actually, this function should only return log_prob; this logic should be handled by the user outside
+ recompute_old_log_prob = self.config.get('recompute_old_log_prob', True)
+
+ if recompute_old_log_prob or 'old_log_probs' not in data.batch.keys():
+ select_keys = ['responses', 'input_ids', 'attention_mask', 'position_ids']
+ batch = data.select(batch_keys=select_keys).batch
+ input_ids = batch['input_ids']
+ batch_size = input_ids.size(0)
+ response = batch['responses']
+ response_length = response.size(1)
+ with torch.no_grad():
+ output = self.forward_backward_batch(data, forward_only=True, post_process_fn=compute_logprobs_fn)
+ if mpu.is_pipeline_last_stage(ignore_virtual=True):
+                # only available on the last pp rank; it is replicated on every tp rank
+                log_probs = torch.cat([o['log_probs'] for o in output], dim=0) # (bs, response_length)
+ log_probs = log_probs.to(torch.float32)
+ else:
+ log_probs = torch.empty(size=(batch_size, response_length),
+ dtype=torch.float32,
+ device=input_ids.device)
+
+ # broadcast across pp ranks
+ torch.distributed.broadcast(tensor=log_probs,
+ src=mpu.get_pipeline_model_parallel_last_rank(),
+ group=mpu.get_pipeline_model_parallel_group(),
+ async_op=False)
+
+ # add empty cache after each compute
+ torch.cuda.empty_cache()
+
+ return log_probs
+
+ def make_minibatch_iterator(self, data: DataProto) -> Iterable[DataProto]:
+ """Make minibatch iterator for updating the actor
+
+ Args:
+ data (DataProto): a DataProto containing keys
+
+ ``input_ids``: tensor of shape [batch_size, sequence_length]. torch.int64, where ``sequence_length = prompt_length + response_length``
+
+ ``attention_mask``: tensor of shape [batch_size, sequence_length]. torch.int64
+
+ ``position_ids``: tensor of shape [batch_size, sequence_length]. torch.int64
+
+ ``responses``: tensor of shape [batch_size, response_length]. torch.int64. Note that responses = input_ids[:, -response_length:]
+
+ ``old_log_probs``: tensor of shape [batch_size, response_length]. torch.float32. The log probability of responses.
+
+ ``advantages``: tensor of shape [batch_size, response_length]. torch.float32. The advantages of responses.
+ See PPO paper for details. https://arxiv.org/abs/1707.06347
+
+        Returns:
+            Iterable[DataProto]: an iterator that yields minibatches of size ``ppo_mini_batch_size``
+                for ``ppo_epochs`` epochs, shuffled according to ``shuffle``.
+
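+        Example (illustrative sketch):
+
+        >>> for mini_batch in actor.make_minibatch_iterator(data):
+        >>>     ... # one PPO update per minibatch; see ``update_policy``
+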
+ """
+ select_keys = ['responses', 'input_ids', 'attention_mask', 'position_ids', 'old_log_probs', 'advantages']
+ data = data.select(batch_keys=select_keys)
+ return data.make_iterator(mini_batch_size=self.config.ppo_mini_batch_size,
+ epochs=self.config.ppo_epochs,
+ dataloader_kwargs={'shuffle': self.config.shuffle})
+
+ def forward_backward_batch(self, data: DataProto, forward_only=False, post_process_fn=None):
+ """
+ We assume:
+ - The model takes input: (input_ids, attention_mask, position_ids). No rmpad for the input
+ - The communication shape is (total_nnz_pad_to_sp // tp_size, 1, hidden_size) if sequence parallel is enabled
+ """
+ # broadcast from last pp rank to all other pp ranks
+ # TODO: actually, we just need to control the sampling order.
+ broadcast_dict_tensor(data.batch,
+ src=mpu.get_pipeline_model_parallel_last_rank(),
+ group=mpu.get_pipeline_model_parallel_group())
+ # split into micro-batches
+ data.batch['attention_mask'] = data.batch['attention_mask'].to(bool)
+
+ if data.meta_info.get('micro_batch_size', None) is not None:
+ batch_size = data.meta_info['micro_batch_size']
+ else:
+ batch_size = self.config.ppo_micro_batch_size
+ batches = split_dict_tensor_into_batches(data.batch, batch_size=batch_size)
+ # compute input shapes for pp stages
+ input_shapes = compute_transformers_input_shapes(
+ batches,
+ meta_info={
+ 'sequence_parallel': self.megatron_config.sequence_parallel,
+ 'hidden_size': self.model_config.hidden_size
+ })
+ n_micro_batch = len(batches)
+ seq_len = batches[0]['input_ids'].shape[1]
+
+ forward_backward_func = get_forward_backward_func()
+
+ def loss_func(output, data, meta_info):
+ if forward_only:
+ if post_process_fn is None:
+ return 1.0, {'logits': output.logits}
+ else:
+ return 1.0, post_process_fn(output, data)
+
+ responses = data['responses']
+ response_length = responses.size(1)
+ attention_mask = data['attention_mask']
+ response_mask = attention_mask[:, -response_length:]
+ old_log_prob = data['old_log_probs']
+ advantages = data['advantages']
+
+ clip_ratio = meta_info['clip_ratio']
+ entropy_coeff = meta_info['entropy_coeff']
+
+ # compute policy loss
+ logits = output.logits
+ logits = logits[:, -response_length - 1:-1]
+ log_prob = vocab_parallel_log_probs_from_logits(logits, responses)
+ pg_loss, pg_clipfrac, ppo_kl = core_algos.compute_policy_loss(old_log_prob=old_log_prob,
+ log_prob=log_prob,
+ advantages=advantages,
+ eos_mask=response_mask,
+ cliprange=clip_ratio)
+ entropy_loss = vocab_parallel_compute_entropy_loss(logits, eos_mask=response_mask)
+ policy_loss = pg_loss - entropy_loss * entropy_coeff
+ # return loss and stats
+ stats = {
+ 'actor/entropy_loss': entropy_loss.detach().item(),
+ 'actor/pg_loss': pg_loss.detach().item(),
+ 'actor/pg_clipfrac': pg_clipfrac.detach().item(),
+ 'actor/ppo_kl': ppo_kl.detach().item()
+ }
+ return policy_loss, stats
+
+ def forward_step(batch_iter, model):
+ batch = next(batch_iter)
+ input_ids = batch['input_ids']
+ attention_mask = batch['attention_mask']
+ position_ids = batch['position_ids']
+ output = model(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids)
+ if forward_only:
+ meta_info = None
+ else:
+ meta_info = {'clip_ratio': self.config.clip_ratio, 'entropy_coeff': self.config.entropy_coeff}
+ return output, partial(loss_func, data=batch, meta_info=meta_info)
+
+        # make_batch_generator yields one micro-batch iterator per vpp model chunk
+ batch_generator = make_batch_generator(batches, vpp_size=len(self.actor_module))
+
+ # TODO: we may use the new schedule instead
+ # for flash-attn: (seq_len, batch_size, hidden_size) = (mbs*seq_len, 1, hidden_size)
+ if mpu.get_pipeline_model_parallel_world_size() > 1:
+ losses_reduced = forward_backward_func(
+ forward_step_func=forward_step,
+ data_iterator=batch_generator,
+ model=self.actor_module,
+ num_microbatches=n_micro_batch,
+ input_shapes=input_shapes, # must set for flash-attn sequence packing
+                seq_length=batch_size * seq_len, # unused when input_shapes is set
+                hidden_size=self.model_config.hidden_size, # unused when input_shapes is set
+                micro_batch_size=1, # unused when input_shapes is set
+ forward_only=forward_only,
+ )
+ else:
+ losses_reduced = forward_backward_func(
+ forward_step_func=forward_step,
+ data_iterator=batch_generator,
+ model=self.actor_module,
+ num_microbatches=n_micro_batch,
+                seq_length=batch_size * seq_len, # used when pp == 1
+                hidden_size=self.model_config.hidden_size, # used when pp == 1
+                micro_batch_size=1, # used when pp == 1
+ forward_only=forward_only,
+ )
+        # losses_reduced is a list with one stats dict per micro-batch, as returned from loss_func
+ return losses_reduced
+
+ def update_policy(self, dataloader: Iterable[DataProto]) -> Dict:
+ """Update the policy with an iterator of DataProto
+
+ Args:
+            dataloader (Iterable[DataProto]): an iterator over DataProto returned by ``make_minibatch_iterator``.
+                The keys of each data batch are described in ``make_minibatch_iterator``.
+
+ Returns:
+ Dict: a dictionary containing the statistics. Note that the statistics are only valid in the last pp stage
+ and users have to combine the output in each dp rank manually.
+
+ """
+ metrics = {}
+ for data in dataloader:
+ # data = data.batch.to(self.actor_module.device)
+ self.actor_optimizer.zero_grad()
+            # assumes use_contiguous_buffers_in_local_ddp and no overlap_dp_param_comm
+ for chunk in self.actor_module:
+ # if use distributed optimizer, zero grad buffer will be handled by optimizer
+ chunk.zero_grad_buffer(zero_buffer=(not self.actor_optimizer_config.use_distributed_optimizer))
+
+ metric_micro_batch = self.forward_backward_batch(data)
+
+ update_successful, grad_norm, num_zeros_in_grad = self.actor_optimizer.step(
+ self.megatron_config, self.megatron_config.timers)
+ if update_successful:
+ # allgather already execute in optimizer.step in new megatron
+ pass
+ else:
+ raise NotImplementedError
+
+ for metric in metric_micro_batch:
+ append_to_dict(metrics, metric) # append the metric from this micro-batch to global metrics.
+
+ # add empty cache after each compute
+ torch.cuda.empty_cache()
+
+ return metrics
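+
+
+# Illustrative end-to-end sketch (assumes the objects from the class docstring
+# above have been constructed; kept as a comment so nothing runs on import):
+#
+#   actor = MegatronPPOActor(config=config,
+#                            model_config=actor_model_config,
+#                            megatron_config=megatron_config,
+#                            actor_module=actor_module,
+#                            actor_optimizer=actor_optimizer,
+#                            actor_optimizer_config=actor_optimizer_config)
+#   old_log_probs = actor.compute_log_prob(data)      # forward-only pass
+#   dataloader = actor.make_minibatch_iterator(data)  # minibatch split
+#   metrics = actor.update_policy(dataloader)         # PPO updates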
diff --git a/code/RL_model/verl/Search-R1/verl/workers/critic/__init__.py b/code/RL_model/verl/Search-R1/verl/workers/critic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..80808f10634b74ee3be94e3dc19e86855f884cc8
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/critic/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base import BasePPOCritic
+from .dp_critic import DataParallelPPOCritic
+
+__all__ = ["BasePPOCritic", "DataParallelPPOCritic"]
diff --git a/code/RL_model/verl/Search-R1/verl/workers/critic/base.py b/code/RL_model/verl/Search-R1/verl/workers/critic/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d1055df4e04d80624d2ca28afcf6f6df3642b91
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/critic/base.py
@@ -0,0 +1,40 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Base class for a critic
+"""
+from abc import ABC, abstractmethod
+
+import torch
+
+from verl import DataProto
+
+__all__ = ['BasePPOCritic']
+
+
+class BasePPOCritic(ABC):
+
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+
+ @abstractmethod
+ def compute_values(self, data: DataProto) -> torch.Tensor:
+ """Compute values"""
+ pass
+
+ @abstractmethod
+ def update_critic(self, data: DataProto):
+ """Update the critic"""
+ pass
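+
+
+# A minimal subclass sketch (illustrative only; real critics such as
+# DataParallelPPOCritic wrap a value model and an optimizer):
+#
+#   class ConstantCritic(BasePPOCritic):
+#       def compute_values(self, data: DataProto) -> torch.Tensor:
+#           responses = data.batch['responses']
+#           return torch.zeros(responses.shape, dtype=torch.float32)
+#
+#       def update_critic(self, data: DataProto):
+#           return {}  # nothing to learn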
diff --git a/code/RL_model/verl/Search-R1/verl/workers/critic/dp_critic.py b/code/RL_model/verl/Search-R1/verl/workers/critic/dp_critic.py
new file mode 100644
index 0000000000000000000000000000000000000000..0842ff4a489cacd4331112aaefd6719ca22c1294
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/critic/dp_critic.py
@@ -0,0 +1,204 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Implement a multiprocess PPOCritic
+"""
+import itertools
+from typing import Iterable
+
+import torch
+import torch.distributed
+from torch import nn, optim
+
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+
+from verl import DataProto
+from verl.trainer.ppo import core_algos
+from verl.workers.critic import BasePPOCritic
+from verl.utils.py_functional import append_to_dict
+from verl.utils.torch_functional import masked_mean
+from verl.utils.ulysses import ulysses_pad_and_slice_inputs, gather_outpus_and_unpad
+from verl.utils.seqlen_balancing import rearrange_micro_batches, get_reverse_idx
+
+from flash_attn.bert_padding import pad_input, unpad_input, rearrange, index_first_axis
+
+__all__ = ['DataParallelPPOCritic']
+
+
+class DataParallelPPOCritic(BasePPOCritic):
+
+ def __init__(self, config, critic_module: nn.Module, critic_optimizer: optim.Optimizer):
+ super().__init__(config=config)
+ self.critic_module = critic_module
+ self.critic_optimizer = critic_optimizer
+ self.use_remove_padding = self.config.model.get('use_remove_padding', False)
+ print(f'Critic use_remove_padding={self.use_remove_padding}')
+
+ assert self.config.ppo_mini_batch_size % self.config.ppo_micro_batch_size == 0
+ self.gradient_accumulation = self.config.ppo_mini_batch_size // self.config.ppo_micro_batch_size
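+        # e.g. (illustrative numbers) ppo_mini_batch_size=256 with
+        # ppo_micro_batch_size=32 gives 8 gradient-accumulation steps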
+
+ self.ulysses_sequence_parallel_size = self.config.get('ulysses_sequence_parallel_size', 1)
+
+ def _forward_micro_batch(self, micro_batch):
+ response_length = micro_batch['responses'].size(-1)
+ with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
+ input_ids = micro_batch['input_ids']
+ batch, seqlen = input_ids.shape
+ attention_mask = micro_batch['attention_mask']
+ position_ids = micro_batch['position_ids']
+
+ if self.use_remove_padding:
+ input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1),
+ attention_mask) # input_ids_rmpad (total_nnz, ...)
+ input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz)
+
+ # unpad the position_ids to align the rotary
+ position_ids_rmpad = index_first_axis(rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."),
+ indices).transpose(0, 1)
+
+ # pad and slice the inputs if sp > 1
+ if self.ulysses_sequence_parallel_size > 1:
+ input_ids_rmpad, position_ids_rmpad, pad_size = ulysses_pad_and_slice_inputs(input_ids_rmpad, \
+ position_ids_rmpad, \
+ sp_size=self.ulysses_sequence_parallel_size)
+
+ # only pass input_ids and position_ids to enable flash_attn_varlen
+                output = self.critic_module(input_ids=input_ids_rmpad,
+                                            attention_mask=None,
+                                            position_ids=position_ids_rmpad,
+                                            use_cache=False)  # prevent the model from treating this as generation
+ values_rmpad = output.logits
+ values_rmpad = values_rmpad.squeeze(0) # (total_nnz)
+
+ # gather output if sp > 1
+ if self.ulysses_sequence_parallel_size > 1:
+ values_rmpad = gather_outpus_and_unpad(values_rmpad,
+ gather_dim=0,
+ unpad_dim=0,
+ padding_size=pad_size)
+
+ # pad it back
+ values = pad_input(values_rmpad, indices=indices, batch=batch, seqlen=seqlen).squeeze(-1)
+ values = values[:, -response_length - 1:-1]
+ else:
+                output = self.critic_module(input_ids=input_ids,
+                                            attention_mask=attention_mask,
+                                            position_ids=position_ids,
+                                            use_cache=False)  # prevent the model from treating this as generation
+ values = output.logits
+ values = values[:, -response_length - 1:-1].squeeze(-1)
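+                # alignment note: the value at position t estimates the state from which
+                # token t + 1 is generated, so the slice [-response_length - 1:-1] pairs
+                # one value with each response token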
+ return values
+
+ def _optimizer_step(self):
+ assert self.config.grad_clip is not None
+
+ if isinstance(self.critic_module, FSDP):
+ grad_norm = self.critic_module.clip_grad_norm_(self.config.grad_clip)
+ else:
+ grad_norm = torch.nn.utils.clip_grad_norm_(self.critic_module.parameters(), max_norm=self.config.grad_clip)
+ self.critic_optimizer.step()
+ return grad_norm
+
+ def compute_values(self, data: DataProto) -> torch.Tensor:
+ self.critic_module.eval()
+ micro_batch_size = data.meta_info['micro_batch_size']
+ select_keys = ['responses', 'input_ids', 'attention_mask', 'position_ids']
+ batch = data.select(batch_keys=select_keys).batch
+ use_dynamic_bsz = data.meta_info['use_dynamic_bsz']
+
+ if use_dynamic_bsz:
+ # split using dynamic bsz
+ max_token_len = data.meta_info['max_token_len'] * self.ulysses_sequence_parallel_size
+ micro_batches, indices = rearrange_micro_batches(batch=batch, max_token_len=max_token_len)
+ else:
+ micro_batches = batch.split(micro_batch_size)
+
+ values_lst = []
+ for micro_batch in micro_batches:
+ with torch.no_grad():
+ values = self._forward_micro_batch(micro_batch)
+ values_lst.append(values)
+ values = torch.concat(values_lst, dim=0)
+ responses = data.batch['responses']
+ attention_mask = data.batch['attention_mask']
+ response_length = responses.size(1)
+ values = values * attention_mask[:, -response_length - 1:-1]
+
+ if use_dynamic_bsz:
+ indices = list(itertools.chain.from_iterable(indices))
+ assert len(indices) == values.size(0), f"{len(indices)} vs. {values.size()}"
+ revert_indices = torch.tensor(get_reverse_idx(indices), dtype=torch.long)
+ values = values[revert_indices]
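+            # e.g. (illustrative) if indices == [2, 0, 1], get_reverse_idx yields
+            # [1, 2, 0], and the gather above restores the original sample order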
+
+ return values
+
+ def update_critic(self, data: DataProto):
+ # make sure we are in training mode
+ self.critic_module.train()
+ metrics = {}
+
+ select_keys = ['input_ids', 'responses', 'attention_mask', 'position_ids', 'values', 'returns']
+ batch = data.select(batch_keys=select_keys).batch
+        # Split to make minibatch iterator for updating the critic
+        # See PPO paper for details. https://arxiv.org/abs/1707.06347
+ dataloader = batch.split(self.config.ppo_mini_batch_size)
+
+ for batch_idx, data in enumerate(dataloader):
+ # split batch into micro_batches
+ mini_batch = data
+ if self.config.use_dynamic_bsz:
+ max_token_len = self.config.ppo_max_token_len_per_gpu * self.ulysses_sequence_parallel_size
+ micro_batches, _ = rearrange_micro_batches(batch=mini_batch, max_token_len=max_token_len)
+ else:
+ micro_batches = mini_batch.split(self.config.ppo_micro_batch_size)
+
+ self.critic_optimizer.zero_grad()
+
+ for data in micro_batches:
+ data = data.cuda() # critic device is cpu when using offload
+ input_ids = data['input_ids']
+ responses = data['responses']
+ attention_mask = data['attention_mask']
+ position_ids = data['position_ids']
+ values = data['values']
+ returns = data['returns']
+ response_length = responses.size(1)
+
+ eos_mask = attention_mask[:, -response_length - 1:-1]
+
+ vpreds = self._forward_micro_batch(data)
+
+ # assert not torch.any(torch.isnan(vpreds)).item()
+
+ vf_loss, vf_clipfrac = core_algos.compute_value_loss(vpreds=vpreds,
+ values=values,
+ returns=returns,
+ eos_mask=eos_mask,
+ cliprange_value=self.config.cliprange_value)
+ loss = vf_loss / self.gradient_accumulation
+ loss.backward()
+
+ data = {
+ 'critic/vf_loss': vf_loss.detach().item(),
+ 'critic/vf_clipfrac': vf_clipfrac.detach().item(),
+ 'critic/vpred_mean': masked_mean(vpreds, eos_mask).detach().item(),
+ }
+
+ append_to_dict(metrics, data)
+
+ grad_norm = self._optimizer_step()
+ data = {'critic/grad_norm': grad_norm.detach().item()}
+ append_to_dict(metrics, data)
+ self.critic_optimizer.zero_grad()
+ return metrics
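+
+
+# Illustrative usage sketch (placeholder names; assumes `data` carries
+# `micro_batch_size` and `use_dynamic_bsz` in its meta_info):
+#
+#   critic = DataParallelPPOCritic(config=config.critic,
+#                                  critic_module=critic_module,
+#                                  critic_optimizer=critic_optimizer)
+#   values = critic.compute_values(data)   # (bs, response_length) value estimates
+#   metrics = critic.update_critic(data)   # clipped value-function update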
diff --git a/code/RL_model/verl/Search-R1/verl/workers/critic/megatron_critic.py b/code/RL_model/verl/Search-R1/verl/workers/critic/megatron_critic.py
new file mode 100644
index 0000000000000000000000000000000000000000..a39ad4b460e609373f0283f7171f39127f813189
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/critic/megatron_critic.py
@@ -0,0 +1,229 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Implement a multiprocess PPOCritic
+"""
+
+from functools import partial
+from typing import Iterable
+
+import torch
+import torch.distributed
+from omegaconf import OmegaConf
+from torch import nn
+
+from verl import DataProto
+from verl.trainer.ppo import core_algos
+from verl.workers.critic import BasePPOCritic
+from verl.utils.megatron.pipeline_parallel import (compute_transformers_input_shapes, make_batch_generator)
+from verl.utils.py_functional import append_to_dict
+from verl.utils.torch_dtypes import PrecisionType
+from verl.utils.torch_functional import masked_mean, broadcast_dict_tensor, split_dict_tensor_into_batches
+from verl.utils.megatron import sequence_parallel as sp_utils
+from verl.utils.megatron.optimizer_config import OptimizerConfig
+
+from megatron.optimizer import DistributedOptimizer
+from megatron.core import parallel_state as mpu
+from megatron.core.pipeline_parallel import get_forward_backward_func
+
+
+class MegatronPPOCritic(BasePPOCritic):
+
+ def __init__(self, config, model_config, megatron_config, critic_module: nn.ModuleList,
+ critic_optimizer: DistributedOptimizer, critic_optimizer_config: OptimizerConfig):
+ super().__init__(config=config)
+
+ self.model_config = model_config
+ self.megatron_config = megatron_config
+
+ self.critic_module = critic_module
+ self.critic_optimizer = critic_optimizer
+ self.critic_optimizer_config = critic_optimizer_config
+
+        # we create a separate config for the optimizer step so that global args won't affect it.
+ self.optimizer_step_args = OmegaConf.create({
+ 'skip_grad': None,
+ 'overlap_dp_param_comm': False,
+ 'overlap_dp_grad_comm': False,
+ 'gradient_accumulation_steps': 1,
+ 'sequence_parallel': self.megatron_config.sequence_parallel,
+ 'DDP_impl': 'local',
+ 'layernorm_allreduce_bucket_threshold': 0,
+ 'pipeline_model_parallel_split_rank': None,
+ 'reduce_grads_use_alltoall': False
+ })
+
+ if self.config.kl_ctrl.type == 'fixed':
+ self.kl_ctrl = core_algos.FixedKLController(kl_coef=self.config.kl_ctrl.kl_coef)
+ elif self.config.kl_ctrl.type == 'adaptive':
+ assert self.config.kl_ctrl.horizon > 0, f'horizon must be larger than 0. Got {self.config.kl_ctrl.horizon}'
+ self.kl_ctrl = core_algos.AdaptiveKLController(init_kl_coef=self.config.kl_ctrl.kl_coef,
+ target_kl=self.config.kl_ctrl.target_kl,
+ horizon=self.config.kl_ctrl.horizon)
+ else:
+ raise NotImplementedError
+
+ def compute_values(self, data: DataProto) -> DataProto:
+ # data.batch = data.batch.to(self.critic_module.module.device)
+ responses = data.batch['responses']
+ attention_mask = data.batch['attention_mask']
+ response_length = responses.size(1)
+ with torch.no_grad():
+ output = self.forward_backward_batch(data=data, forward_only=True)
+ if mpu.is_pipeline_last_stage(ignore_virtual=True):
+                # only available on the last pp rank; it is replicated on every tp rank
+                values = torch.cat([o['vpreds'] for o in output], dim=0) # (bs, seq_len)
+ values = values.to(torch.float32)
+ else:
+ values = torch.empty_like(attention_mask, dtype=torch.float32)
+
+            # each tp rank should contain the same values
+ values = values * attention_mask
+ values = values[:, -response_length - 1:-1]
+ values = values.contiguous()
+
+ # sync among pp ranks
+ torch.distributed.broadcast(tensor=values,
+ src=mpu.get_pipeline_model_parallel_last_rank(),
+ group=mpu.get_pipeline_model_parallel_group())
+
+ # add empty cache after each compute
+ torch.cuda.empty_cache()
+
+ return values
+
+ def make_minibatch_iterator(self, data: DataProto) -> Iterable[DataProto]:
+ select_keys = ['input_ids', 'responses', 'attention_mask', 'position_ids', 'values', 'returns']
+ data = data.select(batch_keys=select_keys)
+ return data.make_iterator(mini_batch_size=self.config.ppo_mini_batch_size,
+ epochs=self.config.ppo_epochs,
+ dataloader_kwargs={'shuffle': self.config.shuffle})
+
+ def forward_backward_batch(self, data: DataProto, forward_only=False):
+ # broadcast from last pp rank to all other pp ranks
+ data.batch = data.batch.contiguous()
+ broadcast_dict_tensor(data.batch,
+ src=mpu.get_pipeline_model_parallel_last_rank(),
+ group=mpu.get_pipeline_model_parallel_group())
+ # split into micro-batches
+ data.batch['attention_mask'] = data.batch['attention_mask'].to(bool)
+ batches = split_dict_tensor_into_batches(data.batch, batch_size=self.config.ppo_micro_batch_size)
+ n_micro_batch = len(batches)
+ seq_len = batches[0]['input_ids'].shape[1]
+
+ # compute input shapes for pp stages
+ input_shapes = compute_transformers_input_shapes(
+ batches,
+ meta_info={
+ 'sequence_parallel': self.megatron_config.sequence_parallel,
+ 'hidden_size': self.model_config.hidden_size
+ })
+
+ forward_backward_func = get_forward_backward_func()
+
+ def loss_func(output, data, meta_info):
+ if forward_only:
+ return 1.0, {'vpreds': output.logits}
+
+ responses = data['responses']
+ attention_mask = data['attention_mask']
+ values = data['values']
+ returns = data['returns']
+ response_length = responses.size(1)
+
+ eos_mask = attention_mask[:, -response_length:]
+
+ cliprange_value = self.config.cliprange_value
+
+ vpreds = output.logits # (bs, sequence_length)
+ vpreds = vpreds[:, -response_length - 1:-1]
+
+ vf_loss, vf_clipfrac = core_algos.compute_value_loss(vpreds=vpreds,
+ values=values,
+ returns=returns,
+ eos_mask=eos_mask,
+ cliprange_value=cliprange_value)
+ stats = {
+ 'critic/vf_loss': vf_loss.detach().item(),
+ 'critic/vf_clipfrac': vf_clipfrac.detach().item(),
+ 'critic/vpred_mean': masked_mean(vpreds, eos_mask).detach().item(),
+ }
+
+ return vf_loss, stats
+
+ def forward_step(batch_iter, model):
+ batch = next(batch_iter)
+ input_ids = batch['input_ids']
+ attention_mask = batch['attention_mask']
+ position_ids = batch['position_ids']
+ output = model(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids)
+ return output, partial(loss_func, data=batch, meta_info={})
+
+        # make_batch_generator yields one micro-batch iterator per vpp model chunk
+ batch_generator = make_batch_generator(batches, vpp_size=len(self.critic_module))
+
+ # TODO: we may use the new schedule instead
+ # for flash-attn: (seq_len, batch_size, hidden_size) = (mbs*seq_len, 1, hidden_size)
+ if mpu.get_pipeline_model_parallel_world_size() > 1:
+ losses_reduced = forward_backward_func(
+ forward_step_func=forward_step,
+ data_iterator=batch_generator,
+ model=self.critic_module,
+ num_microbatches=n_micro_batch,
+ input_shapes=input_shapes, # must set for flash-attn sequence packing
+                seq_length=self.config.ppo_micro_batch_size * seq_len, # unused when input_shapes is set
+                hidden_size=self.model_config.hidden_size, # unused when input_shapes is set
+                micro_batch_size=1, # unused when input_shapes is set
+ forward_only=forward_only,
+ )
+ else:
+ losses_reduced = forward_backward_func(
+ forward_step_func=forward_step,
+ data_iterator=batch_generator,
+ model=self.critic_module,
+ num_microbatches=n_micro_batch,
+                seq_length=self.config.ppo_micro_batch_size * seq_len, # used when pp == 1
+                hidden_size=self.model_config.hidden_size, # used when pp == 1
+                micro_batch_size=1, # used when pp == 1
+ forward_only=forward_only,
+ )
+        # losses_reduced is a list with one stats dict per micro-batch, as returned from loss_func
+ return losses_reduced
+
+ def update_critic(self, dataloader: Iterable[DataProto]):
+ metrics = {}
+
+ for data in dataloader:
+ # data = data.batch.to(self.critic_module.device)
+ self.critic_optimizer.zero_grad()
+            # assumes use_contiguous_buffers_in_local_ddp and no overlap_dp_param_comm
+ for chunk in self.critic_module:
+ chunk.zero_grad_buffer(zero_buffer=(not self.critic_optimizer_config.use_distributed_optimizer))
+
+ metric_micro_batch = self.forward_backward_batch(data)
+
+ update_successful, grad_norm, num_zeros_in_grad = self.critic_optimizer.step(
+ self.megatron_config, self.megatron_config.timers)
+ if update_successful:
+ # allgather already execute in optimizer.step in new megatron
+ pass
+ else:
+ raise NotImplementedError
+
+ for metric in metric_micro_batch:
+ append_to_dict(metrics, metric) # append the metric from this micro-batch to global metrics.
+
+ # add empty cache after each compute
+ torch.cuda.empty_cache()
+ return metrics
diff --git a/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py b/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5ba4ea39448b3b4af59f5340f75212761ca4e72
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/fsdp_workers.py
@@ -0,0 +1,1054 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The main entry point to run the PPO algorithm
+"""
+
+import logging
+import os
+import warnings
+
+import torch
+import torch.distributed
+import verl.utils.hdfs_io as hdfs_io
+import verl.utils.torch_functional as verl_F
+from omegaconf import DictConfig, open_dict
+from verl import DataProto
+from verl.single_controller.base import Worker
+from verl.single_controller.base.decorator import register, Dispatch
+from verl.utils import hf_tokenizer
+from verl.utils.debug import log_gpu_memory_usage
+from verl.utils.fs import copy_local_path_from_hdfs
+from verl.utils.fsdp_utils import get_fsdp_wrap_policy, offload_fsdp_grad, init_fn, get_init_weight_context_manager
+from verl.utils.fsdp_utils import offload_fsdp_optimizer, offload_fsdp_param_and_grad, load_fsdp_optimizer, \
+ load_fsdp_param_and_grad
+from verl.utils.import_utils import import_external_libs
+from verl.utils.model import compute_position_id_with_mask
+from verl.utils.flops_counter import FlopsCounter
+from verl.workers.sharding_manager.fsdp_ulysses import FSDPUlyssesShardingManager
+
+from codetiming import Timer
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv('VERL_PPO_LOGGING_LEVEL', 'WARN'))
+
+
+class ActorRolloutRefWorker(Worker):
+ """
+    This worker can be instantiated as a standalone actor, a standalone rollout, a standalone reference policy,
+    or a hybrid engine, based on config.rollout
+ """
+
+ def __init__(self, config: DictConfig, role: str):
+ super().__init__()
+ self.config = config
+ import torch.distributed
+ if not torch.distributed.is_initialized():
+ torch.distributed.init_process_group(backend="nccl")
+
+ # build device mesh for FSDP
+ world_size = torch.distributed.get_world_size()
+ from torch.distributed.device_mesh import init_device_mesh
+ # TODO(sgm): support FSDP hybrid shard for larger model
+ self.device_mesh = init_device_mesh('cuda', mesh_shape=(world_size,), mesh_dim_names=['fsdp'])
+
+ # build device mesh for Ulysses Sequence Parallel
+ self.ulysses_device_mesh = None
+ self.ulysses_sequence_parallel_size = self.config.actor.get('ulysses_sequence_parallel_size', 1)
+ dp = world_size // self.ulysses_sequence_parallel_size
+ if self.ulysses_sequence_parallel_size > 1:
+ self.ulysses_device_mesh = init_device_mesh('cuda',
+ mesh_shape=(dp, self.ulysses_sequence_parallel_size),
+ mesh_dim_names=['dp', 'sp'])
+
+ self.ulysses_sharding_manager = FSDPUlyssesShardingManager(self.ulysses_device_mesh)
+
+ self.role = role
+ assert self.role in ['actor', 'rollout', 'ref', 'actor_rollout', 'actor_rollout_ref']
+
+ self._is_actor = self.role in ['actor', 'actor_rollout', 'actor_rollout_ref']
+ self._is_rollout = self.role in ['rollout', 'actor_rollout', 'actor_rollout_ref']
+ self._is_ref = self.role in ['ref', 'actor_rollout_ref']
+
+ self._is_offload_param = False
+ self._is_offload_grad = False
+ self._is_offload_optimizer = False
+ if self._is_actor:
+ self._is_offload_param = self.config.actor.fsdp_config.get('param_offload', False)
+ self._is_offload_grad = self.config.actor.fsdp_config.get('grad_offload', False)
+ self._is_offload_optimizer = self.config.actor.fsdp_config.get('optimizer_offload', False)
+ elif self._is_ref:
+            # TODO: it seems that manual offload is slower than FSDP offload
+ self._is_offload_param = self.config.ref.fsdp_config.get('param_offload', False)
+
+ # normalize config
+ if self._is_actor:
+ self.config.actor.ppo_mini_batch_size //= (self.device_mesh.shape[0] // self.ulysses_sequence_parallel_size)
+ self.config.actor.ppo_micro_batch_size //= (self.device_mesh.shape[0] //
+ self.ulysses_sequence_parallel_size)
+ self.config.actor.ppo_mini_batch_size *= self.config.rollout.n
+ self.config.actor.ppo_micro_batch_size *= self.config.rollout.n
+ if self._is_rollout:
+ self.config.rollout.log_prob_micro_batch_size //= (self.device_mesh.shape[0] //
+ self.ulysses_sequence_parallel_size)
+ self.config.rollout.log_prob_micro_batch_size *= self.config.rollout.n
+ if self._is_ref:
+ self.config.ref.log_prob_micro_batch_size //= (self.device_mesh.shape[0] //
+ self.ulysses_sequence_parallel_size)
+ self.config.ref.log_prob_micro_batch_size *= self.config.rollout.n
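+        # e.g. (illustrative) world_size=8 with ulysses_sequence_parallel_size=2 gives
+        # dp=4, so a global ppo_mini_batch_size of 256 becomes 256 // 4 = 64 per dp
+        # rank, then scaled by rollout.n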
+
+ def _build_model_optimizer(self,
+ model_path,
+ fsdp_config,
+ optim_config,
+ override_model_config,
+ use_remove_padding=False,
+ enable_gradient_checkpointing=False,
+ trust_remote_code=False):
+ from verl.utils.model import print_model_size, update_model_config
+ from verl.utils.torch_dtypes import PrecisionType
+ from transformers import AutoModelForCausalLM, AutoConfig
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy, MixedPrecision
+ from torch import optim
+
+ log_gpu_memory_usage('Before init from HF AutoModel', logger=logger)
+ local_path = copy_local_path_from_hdfs(model_path)
+
+        # note that we have to create the model in fp32. Otherwise, the optimizer states would be in bf16, which is incorrect
+ # TODO(zhangchi.usc1992): 1. support create from random initialized model. 2. Support init with FSDP directly
+ self.tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
+
+ torch_dtype = fsdp_config.get('model_dtype', None)
+ if torch_dtype is None:
+ torch_dtype = torch.float32 if self._is_actor else torch.bfloat16
+ else:
+ torch_dtype = PrecisionType.to_dtype(torch_dtype)
+
+ # override model kwargs
+ actor_model_config = AutoConfig.from_pretrained(local_path, trust_remote_code=trust_remote_code)
+
+ if use_remove_padding:
+ from verl.models.registry import check_model_support_rmpad
+ check_model_support_rmpad(actor_model_config.model_type)
+
+ if use_remove_padding and self.ulysses_sequence_parallel_size > 1:
+ from verl.models.transformers.monkey_patch import apply_monkey_patch
+ apply_monkey_patch(actor_model_config, verbose=True)
+
+ override_config_kwargs = {
+ 'bos_token_id': self.tokenizer.bos_token_id,
+ 'eos_token_id': self.tokenizer.eos_token_id,
+ 'pad_token_id': self.tokenizer.pad_token_id,
+ }
+ override_config_kwargs.update(override_model_config)
+ update_model_config(actor_model_config, override_config_kwargs=override_config_kwargs)
+ if self.rank == 0:
+ print(f'Model config after override: {actor_model_config}')
+
+ # NOTE(fix me): tie_word_embedding causes meta_tensor init to hang
+ init_context = get_init_weight_context_manager(use_meta_tensor=not actor_model_config.tie_word_embeddings)
+
+ with init_context(), warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ actor_module = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=local_path,
+ torch_dtype=torch_dtype,
+ config=actor_model_config,
+ attn_implementation='flash_attention_2',
+ trust_remote_code=trust_remote_code)
+            # some parameters may not be in torch_dtype. TODO(zhangchi.usc1992) remove this after we switch to fsdp2
+ actor_module.to(torch_dtype)
+
+ if enable_gradient_checkpointing:
+ actor_module.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant': False})
+ torch.distributed.barrier()
+
+ if self.rank == 0:
+ print_model_size(actor_module)
+
+ log_gpu_memory_usage('After init from HF AutoModel', logger=logger)
+
+ # We wrap FSDP for rollout as well
+ mixed_precision_config = fsdp_config.get('mixed_precision', None)
+ if mixed_precision_config is not None:
+ param_dtype = PrecisionType.to_dtype(mixed_precision_config.get('param_dtype', 'bf16'))
+ reduce_dtype = PrecisionType.to_dtype(mixed_precision_config.get('reduce_dtype', 'fp32'))
+ buffer_dtype = PrecisionType.to_dtype(mixed_precision_config.get('buffer_dtype', 'fp32'))
+ else:
+ param_dtype = torch.bfloat16
+ reduce_dtype = torch.float32
+ buffer_dtype = torch.float32
+
+ mixed_precision = MixedPrecision(param_dtype=param_dtype, reduce_dtype=reduce_dtype, buffer_dtype=buffer_dtype)
+
+ if self._is_ref:
+ mixed_precision = None
+
+ auto_wrap_policy = get_fsdp_wrap_policy(module=actor_module, config=fsdp_config.get('wrap_policy', None))
+
+ if self._is_rollout and self.config.rollout.name == 'hf':
+            # TODO(zhangchi.usc1992, shengguangming) fix me. Currently, auto_wrap_policy causes HFRollout to hang in Gemma
+ auto_wrap_policy = None
+
+ print(f'wrap_policy: {auto_wrap_policy}')
+
+ # TODO(sgm): support hybrid
+ if auto_wrap_policy is None:
+ sharding_strategy = ShardingStrategy.SHARD_GRAD_OP
+ else:
+ sharding_strategy = ShardingStrategy.FULL_SHARD
+
+ # TODO: add transformer policy
+ actor_module_fsdp = FSDP(
+ actor_module,
+ param_init_fn=init_fn,
+ use_orig_params=False,
+ auto_wrap_policy=auto_wrap_policy,
+ device_id=torch.cuda.current_device(),
+ sharding_strategy=sharding_strategy, # zero3
+ mixed_precision=mixed_precision,
+ sync_module_states=True,
+ device_mesh=self.device_mesh,
+ forward_prefetch=False)
+
+ log_gpu_memory_usage('After Actor FSDP init', logger=logger)
+
+ # TODO: add more optimizer args into config
+ if self._is_actor:
+ from verl.utils.torch_functional import get_constant_schedule_with_warmup
+ actor_optimizer = optim.AdamW(actor_module_fsdp.parameters(),
+ lr=optim_config.lr,
+ betas=optim_config.get('betas', (0.9, 0.999)),
+ weight_decay=optim_config.get('weight_decay', 1e-2))
+
+ total_steps = optim_config.get('total_training_steps', 0)
+ num_warmup_steps_ratio = optim_config.get('lr_warmup_steps_ratio', 0.)
+ num_warmup_steps = int(num_warmup_steps_ratio * total_steps)
+
+ print(f'Total steps: {total_steps}, num_warmup_steps: {num_warmup_steps}')
+
+ actor_lr_scheduler = get_constant_schedule_with_warmup(optimizer=actor_optimizer,
+ num_warmup_steps=num_warmup_steps)
+ else:
+ actor_optimizer = None
+ actor_lr_scheduler = None
+
+ log_gpu_memory_usage('After actor optimizer init', logger=logger)
+
+ return actor_module_fsdp, actor_optimizer, actor_lr_scheduler, actor_model_config
+
+ def _build_rollout(self):
+ from torch.distributed.device_mesh import init_device_mesh
+ # TODO(sgm): support FSDP hybrid shard for larger model
+ infer_tp = self.config.rollout.tensor_model_parallel_size
+ dp = self.world_size // infer_tp
+ assert self.world_size % infer_tp == 0, f'rollout world_size: {self.world_size} is not divisible by infer_tp: {infer_tp}'
+ rollout_device_mesh = init_device_mesh('cuda', mesh_shape=(dp, infer_tp), mesh_dim_names=['dp', 'infer_tp'])
+
+ if self.config.rollout.name == 'hf':
+ from verl.workers.rollout import HFRollout
+ from verl.workers.sharding_manager import BaseShardingManager
+ rollout = HFRollout(module=self.actor_module_fsdp, config=self.config.rollout)
+ rollout_sharding_manager = BaseShardingManager()
+            # TODO: a sharding manager that does nothing?
+ elif self.config.rollout.name == 'vllm':
+ from verl.workers.rollout.vllm_rollout import vLLMRollout
+ from verl.workers.sharding_manager import FSDPVLLMShardingManager
+ log_gpu_memory_usage('Before building vllm rollout', logger=None)
+ rollout = vLLMRollout(actor_module=self.actor_module_fsdp,
+ config=self.config.rollout,
+ tokenizer=self.tokenizer,
+ model_hf_config=self.actor_model_config)
+ log_gpu_memory_usage('After building vllm rollout', logger=None)
+ if torch.distributed.get_world_size() == 1:
+ self.config.rollout.load_format = 'dummy_hf'
+ rollout_sharding_manager = FSDPVLLMShardingManager(module=self.actor_module_fsdp,
+ inference_engine=rollout.inference_engine,
+ model_config=self.actor_model_config,
+ full_params='hf' in self.config.rollout.load_format,
+ device_mesh=rollout_device_mesh)
+ log_gpu_memory_usage('After building sharding manager', logger=None)
+
+ return rollout, rollout_sharding_manager
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def init_model(self):
+ from verl.workers.actor import DataParallelPPOActor
+ # This is used to import external_lib into the huggingface systems
+ import_external_libs(self.config.model.get('external_lib', None))
+
+ from omegaconf import OmegaConf
+ override_model_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create()))
+
+ use_remove_padding = self.config.model.get('use_remove_padding', False)
+
+ if self._is_actor or self._is_rollout:
+ # we need the model for actor and rollout
+ if self._is_actor:
+ optim_config = self.config.actor.optim
+ fsdp_config = self.config.actor.fsdp_config
+ else:
+ optim_config = None
+ fsdp_config = OmegaConf.create()
+ self.actor_module_fsdp, self.actor_optimizer, self.actor_lr_scheduler, self.actor_model_config = self._build_model_optimizer(
+ model_path=self.config.model.path,
+ fsdp_config=fsdp_config,
+ optim_config=optim_config,
+ override_model_config=override_model_config,
+ use_remove_padding=use_remove_padding,
+ enable_gradient_checkpointing=self.config.model.get('enable_gradient_checkpointing', False),
+ trust_remote_code=self.config.model.get('trust_remote_code', False))
+
+ # get the original unwrapped module
+ self.actor_module = self.actor_module_fsdp._fsdp_wrapped_module
+
+ if self._is_offload_param:
+                # params are required during state_dict in the sharding manager
+ offload_fsdp_grad(module=self.actor_module_fsdp)
+ log_gpu_memory_usage('After offload actor grad during init', logger=logger)
+ if self._is_offload_optimizer:
+ offload_fsdp_optimizer(optimizer=self.actor_optimizer)
+ log_gpu_memory_usage('After offload actor optimizer during init', logger=logger)
+ # load from checkpoint
+ if self._is_actor:
+ OmegaConf.set_struct(self.config.actor, True)
+ with open_dict(self.config.actor):
+ self.config.actor.use_remove_padding = use_remove_padding
+ self.actor = DataParallelPPOActor(config=self.config.actor,
+ actor_module=self.actor_module_fsdp,
+ actor_optimizer=self.actor_optimizer)
+
+ if self._is_rollout:
+ self.rollout, self.rollout_sharding_manager = self._build_rollout()
+
+ if self._is_ref:
+ self.ref_module_fsdp = self._build_model_optimizer(model_path=self.config.model.path,
+ fsdp_config=self.config.ref.fsdp_config,
+ optim_config=None,
+ override_model_config=override_model_config,
+ use_remove_padding=use_remove_padding,
+ trust_remote_code=self.config.model.get(
+ 'trust_remote_code', False))[0]
+ if self._is_offload_param:
+ offload_fsdp_param_and_grad(module=self.ref_module_fsdp, offload_grad=self._is_offload_grad)
+
+ OmegaConf.set_struct(self.config.ref, True)
+ with open_dict(self.config.ref):
+ self.config.ref.use_remove_padding = use_remove_padding
+ self.ref_policy = DataParallelPPOActor(config=self.config.ref, actor_module=self.ref_module_fsdp)
+
+ if self._is_actor:
+ self.flops_counter = FlopsCounter(self.actor_model_config)
+
+ torch.cuda.empty_cache()
+
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
+ def update_actor(self, data: DataProto):
+ data = data.to('cuda')
+
+ assert self._is_actor
+ if self._is_offload_param:
+ load_fsdp_param_and_grad(module=self.actor_module_fsdp,
+ device_id=torch.cuda.current_device(),
+ load_grad=self._is_offload_grad)
+ if self._is_offload_optimizer:
+ load_fsdp_optimizer(optimizer=self.actor_optimizer, device_id=torch.cuda.current_device())
+
+ data.batch = data.batch.cuda()
+
+ log_gpu_memory_usage('Before update policy', logger=logger)
+
+ with self.ulysses_sharding_manager:
+ data = self.ulysses_sharding_manager.preprocess_data(data=data)
+ # perform training
+ with Timer(name='update_policy', logger=None) as timer:
+ metrics = self.actor.update_policy(data=data)
+ delta_time = timer.last
+ global_num_tokens = data.meta_info['global_token_num']
+ estimated_flops, promised_flops = self.flops_counter.estimate_flops(global_num_tokens, delta_time)
+ metrics['mfu/actor'] = estimated_flops * self.config.actor.ppo_epochs / promised_flops / self.world_size
+
+ self.actor_lr_scheduler.step()
+ lr = self.actor_lr_scheduler.get_last_lr()[0]
+ metrics['actor/lr'] = lr
+
+ log_gpu_memory_usage('After update policy', logger=logger)
+
+ # TODO: here, we should return all metrics
+ output = DataProto(meta_info={'metrics': metrics})
+
+ output = self.ulysses_sharding_manager.postprocess_data(data=output)
+ output = output.to('cpu')
+
+ if self._is_offload_param:
+ offload_fsdp_param_and_grad(module=self.actor_module_fsdp, offload_grad=self._is_offload_grad)
+ if self._is_offload_optimizer:
+ offload_fsdp_optimizer(optimizer=self.actor_optimizer)
+ torch.cuda.empty_cache()
+ return output
+
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
+ def compute_log_prob(self, data: DataProto) -> DataProto:
+ """mostly copying from generate_sequences"""
+ data = data.to('cuda')
+
+ assert self._is_rollout
+ if self._is_offload_param:
+ load_fsdp_param_and_grad(module=self.actor_module_fsdp,
+ device_id=torch.cuda.current_device(),
+ load_grad=self._is_offload_grad)
+
+ data.batch = data.batch.cuda()
+ meta_info = {'eos_token_id': self.tokenizer.eos_token_id, 'pad_token_id': self.tokenizer.pad_token_id}
+ data.meta_info.update(meta_info)
+
+ with self.ulysses_sharding_manager:
+ data = self.ulysses_sharding_manager.preprocess_data(data)
+ old_log_probs = self.actor.compute_log_prob(data=data)
+ output = DataProto.from_dict(tensors={'old_log_probs': old_log_probs})
+ output = self.ulysses_sharding_manager.postprocess_data(output)
+
+ output = output.to('cpu')
+
+ if self._is_offload_param:
+            # NOTE(sgm): the grad is already on CPU; only offload params here
+ offload_fsdp_param_and_grad(module=self.actor_module_fsdp, offload_grad=self._is_offload_grad)
+ # clear kv cache
+ torch.cuda.empty_cache()
+ log_gpu_memory_usage('After recompute log prob', logger=logger)
+ return output
+
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
+ def generate_sequences(self, prompts: DataProto):
+ prompts = prompts.to('cuda')
+ # set to False if it is validation
+ recompute_log_prob = prompts.meta_info.get('recompute_log_prob', True)
+
+ assert self._is_rollout
+ if self._is_offload_param:
+ load_fsdp_param_and_grad(module=self.actor_module_fsdp,
+ device_id=torch.cuda.current_device(),
+ load_grad=self._is_offload_grad)
+
+ prompts.batch = prompts.batch.cuda()
+ meta_info = {'eos_token_id': self.tokenizer.eos_token_id, 'pad_token_id': self.tokenizer.pad_token_id}
+ prompts.meta_info.update(meta_info)
+ with self.rollout_sharding_manager:
+ log_gpu_memory_usage('After entering rollout sharding manager', logger=logger)
+
+ prompts = self.rollout_sharding_manager.preprocess_data(prompts)
+ output = self.rollout.generate_sequences(prompts=prompts)
+
+ log_gpu_memory_usage('After rollout generation', logger=logger)
+
+ output = self.rollout_sharding_manager.postprocess_data(output)
+
+ if self._is_actor and recompute_log_prob:
+ # we should always recompute old_log_probs when it is HybridEngine
+ output.meta_info['micro_batch_size'] = self.config.rollout.log_prob_micro_batch_size
+ output.meta_info['max_token_len'] = self.config.rollout.log_prob_max_token_len_per_gpu
+ output.meta_info['use_dynamic_bsz'] = self.config.rollout.log_prob_use_dynamic_bsz
+ output.meta_info['temperature'] = self.config.rollout.temperature
+ # perform recompute log_prob
+ with self.ulysses_sharding_manager:
+ output = self.ulysses_sharding_manager.preprocess_data(output)
+ old_log_probs = self.actor.compute_log_prob(data=output)
+ output.batch['old_log_probs'] = old_log_probs
+ output = self.ulysses_sharding_manager.postprocess_data(output)
+
+ output = output.to('cpu')
+
+ if self._is_offload_param:
+            # NOTE(sgm): the grad is already on CPU; only offload params here
+ offload_fsdp_param_and_grad(module=self.actor_module_fsdp, offload_grad=self._is_offload_grad)
+ # clear kv cache
+ torch.cuda.empty_cache()
+ log_gpu_memory_usage('After recompute log prob', logger=logger)
+ return output
+
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
+ def compute_ref_log_prob(self, data: DataProto):
+ assert self._is_ref
+
+ data = data.to('cuda')
+
+ if self._is_offload_param:
+ load_fsdp_param_and_grad(module=self.ref_module_fsdp,
+ device_id=torch.cuda.current_device(),
+ load_grad=self._is_offload_grad)
+
+ micro_batch_size = self.config.ref.log_prob_micro_batch_size
+ data.meta_info['micro_batch_size'] = micro_batch_size
+ data.meta_info['temperature'] = self.config.rollout.temperature
+ data.meta_info['max_token_len'] = self.config.ref.log_prob_max_token_len_per_gpu
+ data.meta_info['use_dynamic_bsz'] = self.config.ref.log_prob_use_dynamic_bsz
+ with self.ulysses_sharding_manager:
+ data = self.ulysses_sharding_manager.preprocess_data(data)
+ output = self.ref_policy.compute_log_prob(data=data)
+ output = DataProto.from_dict(tensors={'ref_log_prob': output})
+ output = self.ulysses_sharding_manager.postprocess_data(output)
+
+ output = output.to('cpu')
+
+ if self._is_offload_param:
+ offload_fsdp_param_and_grad(module=self.ref_module_fsdp, offload_grad=self._is_offload_grad)
+ torch.cuda.empty_cache()
+ return output
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def save_checkpoint(self, local_path, hdfs_path=None):
+ assert self._is_actor
+ import torch
+ if self._is_offload_param:
+ load_fsdp_param_and_grad(module=self.actor_module_fsdp,
+ device_id=torch.cuda.current_device(),
+ load_grad=self._is_offload_grad)
+
+ # TODO: support DCP and save sharded checkpoints
+ import torch.distributed
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType, FullStateDictConfig
+ cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
+ with FSDP.state_dict_type(self.actor.actor_module, StateDictType.FULL_STATE_DICT, cfg):
+ state_dict = self.actor.actor_module.state_dict()
+ if self.rank == 0:
+ print(f'Saving actor checkpoint to {local_path}')
+ os.makedirs(local_path, exist_ok=True)
+ self.actor_module.save_pretrained(local_path, state_dict=state_dict)
+ self.tokenizer.save_pretrained(local_path)
+ if hdfs_path is not None:
+ print(f'Uploading actor checkpoint to {hdfs_path}')
+ hdfs_io.makedirs(hdfs_path, exist_ok=True)
+ hdfs_io.copy(src=local_path, dst=hdfs_path)
+
+ torch.distributed.barrier()
+ if self._is_offload_param:
+ offload_fsdp_param_and_grad(module=self.actor_module_fsdp, offload_grad=self._is_offload_grad)
+
+
+class CriticWorker(Worker):
+
+ def __init__(self, config):
+ super().__init__()
+ import torch.distributed
+ if not torch.distributed.is_initialized():
+ torch.distributed.init_process_group(backend="nccl")
+ self.config = config
+
+ # build device mesh for Ulysses Sequence Parallel
+ world_size = torch.distributed.get_world_size()
+ from torch.distributed.device_mesh import init_device_mesh
+ self.ulysses_device_mesh = None
+ self.ulysses_sequence_parallel_size = self.config.get('ulysses_sequence_parallel_size', 1)
+ dp = world_size // self.ulysses_sequence_parallel_size
+ if self.ulysses_sequence_parallel_size > 1:
+ self.ulysses_device_mesh = init_device_mesh('cuda',
+ mesh_shape=(dp, self.ulysses_sequence_parallel_size),
+ mesh_dim_names=['dp', 'sp'])
+
+ self.ulysses_sharding_manager = FSDPUlyssesShardingManager(self.ulysses_device_mesh)
+
+ # set FSDP offload params
+ self._is_offload_param = self.config.model.fsdp_config.param_offload
+ self._is_offload_grad = self.config.model.fsdp_config.grad_offload
+ self._is_offload_optimizer = self.config.model.fsdp_config.optimizer_offload
+
+ # normalize config
+ self.config.ppo_mini_batch_size //= (torch.distributed.get_world_size() // self.ulysses_sequence_parallel_size)
+ self.config.ppo_micro_batch_size //= (torch.distributed.get_world_size() // self.ulysses_sequence_parallel_size)
+ self.config.forward_micro_batch_size //= (torch.distributed.get_world_size() //
+ self.ulysses_sequence_parallel_size)
+
+ def _build_critic_model_optimizer(self, config):
+ # the following line is necessary
+ from verl.utils.model import LambdaLayer, print_model_size, squeeze
+ from verl.utils.torch_dtypes import PrecisionType
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy, MixedPrecision
+ from torch import optim
+
+ local_path = copy_local_path_from_hdfs(config.model.path)
+        # note that the actor and critic tokenizers may differ, so we override the tokenizer info with the actor's.
+        # The critic may be a randomly initialized model of any architecture, not necessarily the same as the actor.
+
+ tokenizer_path = copy_local_path_from_hdfs(config.model.tokenizer_path)
+ self.tokenizer = hf_tokenizer(tokenizer_path, trust_remote_code=config.model.get('trust_remote_code', False))
+
+ from omegaconf import OmegaConf
+ override_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create()))
+ override_config_kwargs = {
+ 'bos_token_id': self.tokenizer.bos_token_id,
+ 'eos_token_id': self.tokenizer.eos_token_id,
+ 'pad_token_id': self.tokenizer.pad_token_id,
+ }
+ override_config_kwargs.update(override_config)
+ if self.rank == 0:
+ print(f'Critic overriding config {override_config_kwargs}')
+
+ torch_dtype = self.config.model.fsdp_config.get('model_dtype', 'fp32')
+ torch_dtype = PrecisionType.to_dtype(torch_dtype)
+
+ from transformers import AutoConfig, AutoModelForTokenClassification
+ from torch import nn
+
+ trust_remote_code = False
+ critic_model_config = AutoConfig.from_pretrained(local_path, trust_remote_code=trust_remote_code)
+ critic_model_config.num_labels = 1
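+ # num_labels=1 turns the token-classification head into a scalar value
+ # head: the model emits logits of shape (bs, seqlen, 1), which the critic
+ # squeezes into per-token value estimates V(s_t).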
+
+ use_remove_padding = config.model.get('use_remove_padding', False)
+ if use_remove_padding:
+ from verl.models.registry import check_model_support_rmpad
+ check_model_support_rmpad(critic_model_config.model_type)
+
+ if use_remove_padding and self.ulysses_sequence_parallel_size > 1:
+ from verl.models.transformers.monkey_patch import apply_monkey_patch
+ apply_monkey_patch(critic_model_config, verbose=True)
+
+ init_context = get_init_weight_context_manager()
+ with init_context(), warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ setattr(critic_model_config, 'classifier_dropout', 0.)
+ setattr(critic_model_config, 'hidden_dropout', 0.)
+ critic_module = AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path=local_path,
+ torch_dtype=torch_dtype,
+ config=critic_model_config,
+ attn_implementation='flash_attention_2',
+ trust_remote_code=trust_remote_code)
+
+ # some parameters may not be in torch_dtype, so explicitly cast the whole module
+ critic_module.to(torch_dtype)
+
+ if config.model.get('enable_gradient_checkpointing', False):
+ critic_module.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant': False})
+ if self.rank == 0:
+ print_model_size(critic_module)
+
+ self.critic_model_config = critic_model_config
+
+ fsdp_config = self.config.model.fsdp_config
+ mixed_precision_config = fsdp_config.get('mixed_precision', None)
+ if mixed_precision_config is not None:
+ param_dtype = PrecisionType.to_dtype(mixed_precision_config.get('param_dtype', 'bf16'))
+ reduce_dtype = PrecisionType.to_dtype(mixed_precision_config.get('reduce_dtype', 'fp32'))
+ buffer_dtype = PrecisionType.to_dtype(mixed_precision_config.get('buffer_dtype', 'fp32'))
+ else:
+ param_dtype = torch.bfloat16
+ reduce_dtype = torch.float32
+ buffer_dtype = torch.float32
+
+ mixed_precision = MixedPrecision(param_dtype=param_dtype, reduce_dtype=reduce_dtype, buffer_dtype=buffer_dtype)
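+ # Rationale for the defaults: bf16 params keep memory and bandwidth low
+ # during forward/backward, while fp32 gradient reduction and buffers avoid
+ # accumulating rounding error across data-parallel ranks.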
+
+ auto_wrap_policy = get_fsdp_wrap_policy(module=critic_module, config=self.config.model.fsdp_config.wrap_policy)
+
+ log_gpu_memory_usage('Before critic FSDP', logger=None)
+
+ critic_module = FSDP(critic_module,
+ param_init_fn=init_fn,
+ use_orig_params=False,
+ auto_wrap_policy=auto_wrap_policy,
+ device_id=torch.cuda.current_device(),
+ sharding_strategy=ShardingStrategy.FULL_SHARD,
+ mixed_precision=mixed_precision,
+ sync_module_states=True,
+ forward_prefetch=False)
+
+ log_gpu_memory_usage('After critic FSDP', logger=None)
+
+ critic_optimizer = optim.AdamW(critic_module.parameters(),
+ lr=config.optim.lr,
+ betas=config.optim.get('betas', (0.9, 0.999)),
+ weight_decay=config.optim.get('weight_decay', 1e-2))
+
+ total_steps = config.optim.get('total_training_steps', 0)
+ num_warmup_steps_ratio = config.optim.get('lr_warmup_steps_ratio', 0.)
+ num_warmup_steps = int(num_warmup_steps_ratio * total_steps)
+
+ print(f'Total steps: {total_steps}, num_warmup_steps: {num_warmup_steps}')
+
+ from verl.utils.torch_functional import get_constant_schedule_with_warmup
+ critic_lr_scheduler = get_constant_schedule_with_warmup(optimizer=critic_optimizer,
+ num_warmup_steps=num_warmup_steps)
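+ # e.g. total_training_steps=1000 with lr_warmup_steps_ratio=0.05 yields
+ # int(0.05 * 1000) = 50 warmup steps, after which the schedule stays
+ # constant at optim.lr.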
+
+ return critic_module, critic_optimizer, critic_lr_scheduler
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def init_model(self):
+ # This is used to import external_lib into the huggingface systems
+ import_external_libs(self.config.model.get('external_lib', None))
+
+ from verl.workers.critic import DataParallelPPOCritic
+ self.critic_module, self.critic_optimizer, self.critic_lr_scheduler = self._build_critic_model_optimizer(
+ self.config)
+
+ if self._is_offload_param:
+ offload_fsdp_param_and_grad(module=self.critic_module, offload_grad=self._is_offload_grad)
+ if self._is_offload_optimizer:
+ offload_fsdp_optimizer(optimizer=self.critic_optimizer)
+
+ self.critic = DataParallelPPOCritic(config=self.config,
+ critic_module=self.critic_module,
+ critic_optimizer=self.critic_optimizer)
+
+ self.flops_counter = FlopsCounter(self.critic_model_config)
+
+ torch.cuda.empty_cache()
+
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
+ def compute_values(self, data: DataProto):
+ data = data.to('cuda')
+
+ if self._is_offload_param:
+ load_fsdp_param_and_grad(module=self.critic_module,
+ device_id=torch.cuda.current_device(),
+ load_grad=self._is_offload_grad)
+ micro_batch_size = self.config.forward_micro_batch_size
+ data.meta_info['micro_batch_size'] = micro_batch_size
+ data.meta_info['max_token_len'] = self.config.forward_max_token_len_per_gpu
+ data.meta_info['use_dynamic_bsz'] = self.config.use_dynamic_bsz
+ # perform forward computation
+ with self.ulysses_sharding_manager:
+ data = self.ulysses_sharding_manager.preprocess_data(data=data)
+ values = self.critic.compute_values(data=data)
+ output = DataProto.from_dict(tensors={'values': values})
+ output = self.ulysses_sharding_manager.postprocess_data(data=output)
+
+ output = output.to('cpu')
+ if self._is_offload_param:
+ offload_fsdp_param_and_grad(module=self.critic_module, offload_grad=self._is_offload_grad)
+ torch.cuda.empty_cache()
+ return output
+
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
+ def update_critic(self, data: DataProto):
+ data = data.to('cuda')
+ if self._is_offload_param:
+ load_fsdp_param_and_grad(module=self.critic_module,
+ device_id=torch.cuda.current_device(),
+ load_grad=self._is_offload_grad)
+ if self._is_offload_optimizer:
+ load_fsdp_optimizer(optimizer=self.critic_optimizer, device_id=torch.cuda.current_device())
+
+ # perform forward computation
+ with self.ulysses_sharding_manager:
+ data = self.ulysses_sharding_manager.preprocess_data(data=data)
+
+ with Timer(name='update_critic', logger=None) as timer:
+ metrics = self.critic.update_critic(data=data)
+ delta_time = timer.last
+
+ global_num_tokens = data.meta_info['global_token_num']
+ estimated_flops, promised_flops = self.flops_counter.estimate_flops(global_num_tokens, delta_time)
+ metrics['mfu/critic'] = estimated_flops * self.config.ppo_epochs / promised_flops / self.world_size
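+ # MFU sketch: estimate_flops returns the FLOPs achieved for this batch
+ # (from the token counts and elapsed time) plus the hardware's promised
+ # peak; multiplying by ppo_epochs accounts for repeated passes over the
+ # same tokens, and dividing by world_size averages utilization per GPU.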
+
+ self.critic_lr_scheduler.step()
+ lr = self.critic_lr_scheduler.get_last_lr()[0]
+ metrics['critic/lr'] = lr
+
+ output = DataProto(batch=None, meta_info={'metrics': metrics})
+ output = self.ulysses_sharding_manager.postprocess_data(data=output)
+
+ if self._is_offload_param:
+ offload_fsdp_param_and_grad(module=self.critic_module, offload_grad=self._is_offload_grad)
+ if self._is_offload_optimizer:
+ offload_fsdp_optimizer(optimizer=self.critic_optimizer)
+ torch.cuda.empty_cache()
+ output = output.to('cpu')
+ return output
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def save_checkpoint(self, local_path, hdfs_path=None):
+ import torch
+ if self._is_offload_param:
+ load_fsdp_param_and_grad(module=self.critic_module,
+ device_id=torch.cuda.current_device(),
+ load_grad=self._is_offload_grad)
+
+ # TODO: support DCP and save sharded checkpoints
+ import torch.distributed
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType, FullStateDictConfig
+ cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
+ with FSDP.state_dict_type(self.critic_module, StateDictType.FULL_STATE_DICT, cfg):
+ state_dict = self.critic_module.state_dict()
+ if self.rank == 0:
+ print(f'Saving critic checkpoint to {local_path}')
+ os.makedirs(local_path, exist_ok=True)
+ self.critic_module._fsdp_wrapped_module.save_pretrained(local_path, state_dict=state_dict)
+ self.tokenizer.save_pretrained(local_path)
+ if hdfs_path is not None:
+ print(f'Uploading critic checkpoint to {hdfs_path}')
+ hdfs_io.makedirs(hdfs_path, exist_ok=True)
+ hdfs_io.copy(src=local_path, dst=hdfs_path)
+
+ torch.distributed.barrier()
+ if self._is_offload_param:
+ offload_fsdp_param_and_grad(module=self.critic_module, offload_grad=self._is_offload_grad)
+
+
+# TODO(sgm): we may need to extract it to dp_reward_model.py
+class RewardModelWorker(Worker):
+ """
+ Note that we only implement reward models that are subclasses of AutoModelForTokenClassification.
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ import torch.distributed
+ if not torch.distributed.is_initialized():
+ torch.distributed.init_process_group(backend="nccl")
+ self.config = config
+
+ # build device mesh for Ulysses Sequence Parallel
+ world_size = torch.distributed.get_world_size()
+ from torch.distributed.device_mesh import init_device_mesh
+ self.ulysses_device_mesh = None
+ self.ulysses_sequence_parallel_size = self.config.get('ulysses_sequence_parallel_size', 1)
+ dp = world_size // self.ulysses_sequence_parallel_size
+ if self.ulysses_sequence_parallel_size > 1:
+ self.ulysses_device_mesh = init_device_mesh('cuda',
+ mesh_shape=(dp, self.ulysses_sequence_parallel_size),
+ mesh_dim_names=['dp', 'sp'])
+
+ self.ulysses_sharding_manager = FSDPUlyssesShardingManager(self.ulysses_device_mesh)
+
+ self.use_remove_padding = self.config.model.get('use_remove_padding', False)
+ self.config.micro_batch_size //= torch.distributed.get_world_size()
+
+ def _build_model(self, config):
+ # the following line is necessary
+ from transformers import AutoModelForTokenClassification, AutoConfig
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy, CPUOffload
+
+ # download the checkpoint from hdfs
+ local_path = copy_local_path_from_hdfs(config.model.path)
+
+ if self.config.model.input_tokenizer is None:
+ self._do_switch_chat_template = False
+ else:
+ self._do_switch_chat_template = True
+ input_tokenizer_local_path = copy_local_path_from_hdfs(config.model.input_tokenizer)
+ self.input_tokenizer = hf_tokenizer(input_tokenizer_local_path,
+ trust_remote_code=config.model.get('trust_remote_code', False))
+ self.tokenizer = hf_tokenizer(local_path, trust_remote_code=config.model.get('trust_remote_code', False))
+
+ trust_remote_code = config.model.get('trust_remote_code', False)
+ model_config = AutoConfig.from_pretrained(local_path, trust_remote_code=trust_remote_code)
+ model_config.num_labels = 1
+
+ use_remove_padding = config.model.get('use_remove_padding', False)
+ if use_remove_padding:
+ from verl.models.registry import check_model_support_rmpad
+ check_model_support_rmpad(model_config.model_type)
+
+ if use_remove_padding and self.ulysses_sequence_parallel_size > 1:
+ from verl.models.transformers.monkey_patch import apply_monkey_patch
+ apply_monkey_patch(model_config, verbose=True)
+
+ # note that the reward model is frozen (no optimizer state), so unlike the actor it can safely be created in bf16
+ init_context = get_init_weight_context_manager(use_meta_tensor=not model_config.tie_word_embeddings)
+
+ with init_context(), warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ setattr(model_config, 'classifier_dropout', 0.)
+ reward_module = AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path=local_path,
+ config=model_config,
+ torch_dtype=torch.bfloat16,
+ attn_implementation='flash_attention_2',
+ trust_remote_code=trust_remote_code)
+ reward_module.to(torch.bfloat16)
+ auto_wrap_policy = get_fsdp_wrap_policy(module=reward_module, config=self.config.model.fsdp_config)
+
+ reward_module = FSDP(
+ reward_module,
+ param_init_fn=init_fn,
+ use_orig_params=False,
+ auto_wrap_policy=auto_wrap_policy,
+ device_id=torch.cuda.current_device(),
+ sharding_strategy=ShardingStrategy.FULL_SHARD, # zero3
+ sync_module_states=True,
+ cpu_offload=CPUOffload(offload_params=self.config.model.fsdp_config.param_offload),
+ forward_prefetch=False)
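+ # Unlike the actor/critic, the reward model is frozen (no gradients or
+ # optimizer state), so CPUOffload on params is a cheap way to keep it out
+ # of GPU memory between scoring calls.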
+
+ return reward_module
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def init_model(self):
+ # This is used to import external_lib into the huggingface systems
+ import_external_libs(self.config.model.get('external_lib', None))
+ self.reward_module = self._build_model(config=self.config)
+ torch.cuda.empty_cache()
+
+ def _forward_micro_batch(self, micro_batch):
+ from flash_attn.bert_padding import pad_input, unpad_input, index_first_axis, rearrange
+ from verl.utils.ulysses import ulysses_pad_and_slice_inputs, gather_outpus_and_unpad
+
+ with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.bfloat16):
+ input_ids = micro_batch['input_ids']
+ batch_size, seqlen = input_ids.shape
+ attention_mask = micro_batch['attention_mask']
+ position_ids = micro_batch['position_ids']
+
+ if self.use_remove_padding:
+ input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1),
+ attention_mask) # input_ids_rmpad (total_nnz, ...)
+ input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz)
+
+ # unpad the position_ids to align the rotary
+ position_ids_rmpad = index_first_axis(rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."),
+ indices).transpose(0, 1)
+
+ # pad and slice the inputs if sp > 1
+ if self.ulysses_sequence_parallel_size > 1:
+ input_ids_rmpad, position_ids_rmpad, pad_size = ulysses_pad_and_slice_inputs(
+ input_ids_rmpad, position_ids_rmpad, sp_size=self.ulysses_sequence_parallel_size)
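+ # Ulysses SP splits the packed sequence along the token axis: inputs are
+ # first padded so total_nnz divides evenly by sp_size, then each sp rank
+ # keeps its 1/sp_size slice; pad_size is remembered so the gather below
+ # can strip the padding again.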
+
+ # only pass input_ids and position_ids to enable flash_attn_varlen
+ output = self.reward_module(input_ids=input_ids_rmpad,
+ attention_mask=None,
+ position_ids=position_ids_rmpad,
+ use_cache=False)  # prevent the model from thinking we are generating
+ reward_rmpad = output.logits
+ reward_rmpad = reward_rmpad.squeeze(0) # (total_nnz)
+
+ # gather output if sp > 1
+ if self.ulysses_sequence_parallel_size > 1:
+ reward_rmpad = gather_outpus_and_unpad(reward_rmpad,
+ gather_dim=0,
+ unpad_dim=0,
+ padding_size=pad_size)
+
+ # pad it back
+ rm_score = pad_input(reward_rmpad, indices=indices, batch=batch_size, seqlen=seqlen).squeeze(-1)
+ else:
+ output = self.reward_module(input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids)
+ rm_score = output.logits # (batch_size, seq_len, 1)
+ rm_score = rm_score.squeeze(-1)
+
+ # extract the result of the last valid token
+ eos_mask_idx = torch.argmax(position_ids * attention_mask, dim=-1) # (bsz,)
+ rm_score = rm_score[torch.arange(batch_size), eos_mask_idx]
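+ # A minimal sketch of the gather above: with
+ #   position_ids   = [[0, 1, 2, 3, 0, 0]]
+ #   attention_mask = [[1, 1, 1, 1, 0, 0]]
+ # position_ids * attention_mask peaks at the last valid token, so
+ # argmax(dim=-1) -> [3] and rm_score keeps the score at index 3.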
+ return rm_score
+
+ def _expand_to_token_level(self, data: DataProto, scores: torch.Tensor):
+ batch_size = data.batch.batch_size[0]
+ # expand as token_level_reward
+ attention_mask = data.batch['attention_mask']
+ position_ids = data.batch['position_ids']
+ response_length = data.batch['responses'].shape[-1]
+ eos_mask_idx = torch.argmax(position_ids * attention_mask, dim=-1) # (bsz,)
+ token_level_scores = torch.zeros_like(attention_mask, dtype=scores.dtype) # (bsz, seqlen)
+ token_level_scores[torch.arange(batch_size), eos_mask_idx] = scores
+
+ # select the response part
+ token_level_scores = token_level_scores[:, -response_length:]
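+ # Sequences are laid out as [left-padded prompt | response], so the last
+ # response_length columns are exactly the response tokens and the single
+ # EOS-position score lands inside this slice.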
+
+ return token_level_scores
+
+ def _switch_chat_template(self, data: DataProto):
+ src_max_length = data.batch['attention_mask'].shape[-1]
+
+ src_tokenizer = self.input_tokenizer
+ target_tokenizer = self.tokenizer
+
+ rm_input_ids = []
+ rm_attention_mask = []
+
+ for i in range(data.batch.batch_size[0]):
+ # extract raw prompt
+ chat: list = data.non_tensor_batch['raw_prompt'][i].tolist()
+
+ # extract response
+ response_ids = data.batch['responses'][i]
+ response_length = response_ids.shape[-1]
+ valid_response_length = data.batch['attention_mask'][i][-response_length:].sum()
+ valid_response_ids = response_ids[:valid_response_length]
+
+ # decode
+ response = src_tokenizer.decode(valid_response_ids)
+ # remove eos
+ response = response.replace(src_tokenizer.eos_token, '')
+
+ chat.append({'role': 'assistant', 'content': response})
+
+ prompt_with_chat_template = target_tokenizer.apply_chat_template(chat,
+ add_generation_prompt=False,
+ tokenize=False)
+ if self.rank == 0 and i == 0:
+ # for debugging purposes
+ print(f'Switch template. chat: {prompt_with_chat_template}')
+
+ # the maximum length is actually determined by the reward model itself
+ max_length = self.config.get('max_length', src_max_length)
+ if max_length is None:
+ max_length = src_max_length
+ input_ids, attention_mask = verl_F.tokenize_and_postprocess_data(
+ prompt=prompt_with_chat_template,
+ tokenizer=target_tokenizer,
+ max_length=max_length,
+ pad_token_id=target_tokenizer.pad_token_id,
+ left_pad=False, # right padding
+ truncation=self.config.get('truncation', 'right')) # truncate from the right
+
+ rm_input_ids.append(input_ids)
+ rm_attention_mask.append(attention_mask)
+
+ rm_input_ids = torch.cat(rm_input_ids, dim=0)
+ rm_attention_mask = torch.cat(rm_attention_mask, dim=0)
+
+ rm_position_ids = compute_position_id_with_mask(rm_attention_mask)
+
+ rm_inputs = {'input_ids': rm_input_ids, 'attention_mask': rm_attention_mask, 'position_ids': rm_position_ids}
+
+ return DataProto.from_dict(rm_inputs)
+
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
+ def compute_rm_score(self, data: DataProto):
+ import itertools
+ from verl.utils.seqlen_balancing import rearrange_micro_batches, get_reverse_idx
+ data = data.to('cuda')
+ if self._do_switch_chat_template:
+ rm_data = self._switch_chat_template(data)
+
+ rm_data.batch = rm_data.batch.cuda()
+
+ # perform forward computation
+ with self.ulysses_sharding_manager:
+ rm_data = self.ulysses_sharding_manager.preprocess_data(data=rm_data)
+ data = self.ulysses_sharding_manager.preprocess_data(data=data)
+
+ use_dynamic_bsz = self.config.use_dynamic_bsz
+ if use_dynamic_bsz:
+ max_token_len = self.config.forward_max_token_len_per_gpu * self.ulysses_sequence_parallel_size
+ micro_batches, indices = rearrange_micro_batches(batch=rm_data.batch, max_token_len=max_token_len)
+ else:
+ micro_batches = rm_data.batch.split(self.config.micro_batch_size)
+ output = []
+ for micro_batch in micro_batches:
+ rm_score = self._forward_micro_batch(micro_batch)
+ output.append(rm_score)
+ scores = torch.cat(output, dim=0) # (batch_size)
+
+ if use_dynamic_bsz:
+ indices = list(itertools.chain.from_iterable(indices))
+ assert len(indices) == scores.size(0), f"{len(indices)} vs. {scores.size()}"
+ revert_indices = torch.tensor(get_reverse_idx(indices), dtype=torch.long)
+ scores = scores[revert_indices]
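+ # rearrange_micro_batches permutes samples so each micro batch stays under
+ # max_token_len; e.g. if the permuted order was indices = [2, 0, 1],
+ # get_reverse_idx yields [1, 2, 0], and indexing scores with it restores
+ # the original sample order.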
+
+ token_level_scores = self._expand_to_token_level(data, scores)
+ # Note that these are only the scores; they may not be the final rewards used to train RL
+ output = DataProto.from_dict(tensors={'rm_scores': token_level_scores})
+ output = self.ulysses_sharding_manager.postprocess_data(data=output)
+
+ output = output.to('cpu')
+ torch.cuda.empty_cache()
+ return output
diff --git a/code/RL_model/verl/Search-R1/verl/workers/megatron_workers.py b/code/RL_model/verl/Search-R1/verl/workers/megatron_workers.py
new file mode 100644
index 0000000000000000000000000000000000000000..1143b7baa9ed1f15a9660fe892e77a57155b399e
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/megatron_workers.py
@@ -0,0 +1,735 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The main entry point to run the PPO algorithm
+"""
+
+import os
+import logging
+import ray
+import torch
+import torch.distributed
+import torch.nn as nn
+from omegaconf import DictConfig
+from verl.single_controller.base.megatron.worker import MegatronWorker
+from verl.workers.actor.megatron_actor import MegatronPPOActor
+from verl.workers.critic.megatron_critic import MegatronPPOCritic
+from verl.workers.sharding_manager import AllGatherPPModel
+from verl.workers.reward_model.megatron.reward_model import MegatronRewardModel
+
+from verl.single_controller.base.decorator import register, Dispatch
+from verl import DataProto
+from verl.utils.fs import copy_local_path_from_hdfs
+from verl.utils.debug import log_gpu_memory_usage
+from verl.utils.model import load_megatron_model_weights
+from verl.utils.megatron_utils import init_model_parallel_config
+from verl.utils.megatron_utils import offload_megatron_param_and_grad, load_megatron_param_and_grad
+from verl.utils import hf_tokenizer
+
+from megatron.core import parallel_state as mpu
+from megatron.core import ModelParallelConfig
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv('VERL_PPO_LOGGING_LEVEL', 'WARN'))
+
+
+def set_random_seed(seed):
+ import torch
+ import numpy as np
+ import random
+ torch.manual_seed(seed)
+ np.random.seed(seed)
+ random.seed(seed)
+ if torch.cuda.device_count() > 0:
+ from megatron.core import tensor_parallel
+ tensor_parallel.model_parallel_cuda_manual_seed(seed)
+ # FIXME: torch cumsum not support deterministic (used in vllm sampler),
+ # https://github.com/pytorch/pytorch/issues/89492
+ # torch.use_deterministic_algorithms(True, warn_only=True)
+ # os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
+
+
+class ActorRolloutRefWorker(MegatronWorker):
+ """
+ This worker can be instantiated as a standalone actor, a standalone rollout, a standalone reference policy,
+ or a hybrid engine, depending on config.rollout
+ """
+
+ def __init__(self, config: DictConfig, role: str):
+ super().__init__()
+ self.config = config
+
+ # NOTE(sgm): We utilize colocate WorkerGroup by default.
+ # As a result, Workers for different models share the same process.
+ # Therefore, we only require one distributed initialization.
+ # To utilize different parallel strategies for different models:
+ # 1. users should disable WorkerDict; 2. assign a different ResourcePool to each model;
+ # 3. apply the following patch in ray==2.10: https://github.com/ray-project/ray/pull/44385
+ if not torch.distributed.is_initialized():
+ rank = int(os.environ['LOCAL_RANK'])
+ torch.distributed.init_process_group(backend="nccl")
+ torch.cuda.set_device(rank)
+
+ if self.config.actor.megatron.sequence_parallel:
+ os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'
+ mpu.initialize_model_parallel(
+ tensor_model_parallel_size=self.config.actor.megatron.tensor_model_parallel_size,
+ pipeline_model_parallel_size=self.config.actor.megatron.pipeline_model_parallel_size,
+ virtual_pipeline_model_parallel_size=None,
+ pipeline_model_parallel_split_rank=None,
+ use_sharp=False,
+ context_parallel_size=1,
+ expert_model_parallel_size=1,
+ nccl_communicator_config_path=None,
+ )
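+ # Sizing example: on 8 GPUs with tensor_model_parallel_size=2 and
+ # pipeline_model_parallel_size=2, Megatron derives a data-parallel size of
+ # 8 / (2 * 2) = 2, which is what the normalize-config divisions below
+ # obtain from mpu.get_data_parallel_world_size().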
+
+ set_random_seed(seed=self.config.actor.megatron.seed)
+
+ self.role = role
+ assert self.role in ['actor', 'rollout', 'ref', 'actor_rollout', 'actor_rollout_ref']
+
+ self._is_actor = self.role in ['actor', 'actor_rollout', 'actor_rollout_ref']
+ self._is_rollout = self.role in ['rollout', 'actor_rollout', 'actor_rollout_ref']
+ self._is_ref = self.role in ['ref', 'actor_rollout_ref']
+
+ # TODO(sgm): Currently, we only support reference model param offload
+ # will support other offload later
+ self._is_offload_param = False
+ self._is_offload_grad = False
+ self._is_offload_optimizer = False
+
+ # normalize config
+ if self._is_actor and self._is_rollout:
+ self.config.actor.ppo_mini_batch_size //= mpu.get_data_parallel_world_size()
+ self.config.actor.ppo_micro_batch_size //= mpu.get_data_parallel_world_size()
+ self.config.rollout.log_prob_micro_batch_size //= mpu.get_data_parallel_world_size()
+ self._is_offload_param = self.config.actor.get('param_offload', False)
+ self._is_offload_grad = self.config.actor.get('grad_offload', False)
+ self._is_offload_optimizer = self.config.actor.get('optimizer_offload', False)
+ elif self._is_ref:
+ self.config.ref.log_prob_micro_batch_size //= mpu.get_data_parallel_world_size()
+ self._is_offload_param = self.config.ref.get('param_offload', False)
+
+ def _build_model_optimizer(self,
+ model_path,
+ megatron_config: ModelParallelConfig,
+ optim_config,
+ override_model_config,
+ enable_gradient_checkpointing=False):
+ from verl.utils.megatron.optimizer import get_megatron_optimizer
+ from megatron.core.models.gpt.gpt_model import ModelType
+ from verl.utils.model import print_model_size, update_model_config
+ from verl.utils.megatron_utils import get_model, init_megatron_optim_config
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+
+ # Step 1: initialize the tokenizer
+ local_path = copy_local_path_from_hdfs(model_path)
+ self.tokenizer = hf_tokenizer(local_path)
+
+ # Step 2: get the actor_model_config
+ actor_model_config = AutoConfig.from_pretrained(local_path)
+
+ override_config_kwargs = {
+ 'bos_token_id': self.tokenizer.bos_token_id,
+ 'eos_token_id': self.tokenizer.eos_token_id,
+ 'pad_token_id': self.tokenizer.pad_token_id,
+ }
+ override_config_kwargs.update(override_model_config)
+ update_model_config(actor_model_config, override_config_kwargs=override_config_kwargs)
+
+ if self.rank == 0:
+ print(f'Model config after override: {actor_model_config}')
+
+ def megatron_actor_model_provider(pre_process, post_process):
+ from verl.utils.model import get_parallel_model_from_config
+ # vpp is not supported yet because it hangs for an unknown reason; needs debugging
+ vpp_rank = mpu.get_virtual_pipeline_model_parallel_rank() # this will be set inside get_model
+ # this_megatron_config = copy.deepcopy(megatron_config)
+ # this_megatron_config.virtual_pipeline_model_parallel_rank = vpp_rank
+ parallel_model = get_parallel_model_from_config(config=actor_model_config,
+ megatron_config=megatron_config,
+ pre_process=pre_process,
+ post_process=post_process,
+ value=False)
+ parallel_model.cuda()
+ return parallel_model
+
+ # Step 3: initialize the megatron model
+ if self._is_actor and self._is_rollout:
+ # Initialize the 3D HybridEngine
+ hybrid_engine = AllGatherPPModel(model_provider=megatron_actor_model_provider)
+ # Fetch the model at current rank
+ actor_module = hybrid_engine.this_rank_models
+ if isinstance(actor_module, nn.ModuleList):
+ actor_module = [actor_module[0]]
+ if self.config.actor.load_weight:
+ load_megatron_model_weights(self.config,
+ actor_model_config,
+ actor_module,
+ params_dtype=megatron_config.params_dtype,
+ is_value_model=False)
+
+ if self.rank == 0:
+ print_model_size(actor_module[0])
+ log_gpu_memory_usage('After AllGatherPPModel init', logger=logger)
+ elif self._is_ref:
+ print(f'self.config.ref.load_weight: {self.config.ref.load_weight}')
+ ref_module = get_model(model_provider_func=megatron_actor_model_provider,
+ model_type=ModelType.encoder_or_decoder,
+ wrap_with_ddp=False)
+ # ref_module = nn.ModuleList(ref_module)
+
+ if self.config.ref.load_weight:  # should align with the actor
+ assert self.config.actor.load_weight == self.config.ref.load_weight
+ print('load ref weight start')
+ load_megatron_model_weights(self.config,
+ actor_model_config,
+ ref_module,
+ params_dtype=megatron_config.params_dtype,
+ is_value_model=False)
+ log_gpu_memory_usage('After ref module init', logger=logger)
+ return ref_module, actor_model_config
+
+ # TODO: add more optimizer args into config
+ if self._is_actor:
+ optim_config = init_megatron_optim_config(optim_config)
+ actor_optimizer = get_megatron_optimizer(model=actor_module, config=optim_config)
+ else:
+ optim_config = None
+ actor_optimizer = None
+
+ log_gpu_memory_usage('After actor optimizer init', logger=logger)
+
+ return actor_module, hybrid_engine, actor_optimizer, actor_model_config, optim_config
+
+ def _build_rollout(self):
+ if self.config.rollout.name == 'vllm':
+ from verl.workers.rollout.vllm_rollout import vLLMRollout
+ from verl.workers.sharding_manager import MegatronVLLMShardingManager
+ from verl.utils.model import normalize_pp_vpp_params
+
+ # NOTE(sgm): If the QKV and gate_up projection layers are concatenated together in the actor,
+ # we reorganize their weight format when resharding from actor to rollout.
+ layer_name_mapping = {
+ "qkv_layer_name":
+ self.config.rollout.layer_name_map.get("qkv_layer_name", "qkv"),
+ "gate_proj_layer_name":
+ self.config.rollout.layer_name_map.get("gate_proj_layer_name", "linear_fc1.weight"),
+ }
+
+ # reshard the weight partition from actor to rollout to initialize the rollout class
+ # create a new cuda space for parameters not in this pp rank
+ self.hybrid_engine.load_params_to_cuda()
+ # broadcast the parameters from pp rank to other ranks
+ self.hybrid_engine.allgather_params()
+ # obtain name to parameters in pp/vpp
+ params = self.hybrid_engine.get_all_params()
+ # update the param names so that layer indices are global across pp/vpp partitions
+ params = normalize_pp_vpp_params(params=params,
+ num_hidden_layers=self.actor_model_config.num_hidden_layers,
+ layer_name='layers')
+ rollout = vLLMRollout(actor_module=params,
+ config=self.config.rollout,
+ tokenizer=self.tokenizer,
+ model_hf_config=self.actor_model_config,
+ train_tp=mpu.get_tensor_model_parallel_world_size())
+ log_gpu_memory_usage('After building vllm rollout', logger=logger)
+
+ # perform weight resharding between actor and rollout
+ sharding_manager = MegatronVLLMShardingManager(module=self.hybrid_engine,
+ inference_engine=rollout.inference_engine,
+ model_config=self.actor_model_config,
+ layer_name_mapping=layer_name_mapping)
+ log_gpu_memory_usage('After building sharding manager', logger=logger)
+ else:
+ raise NotImplementedError('Only vllmRollout is supported with Megatron now')
+
+ return rollout, sharding_manager
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def init_model(self):
+ if self.config.model.get('external_lib', None) is not None:
+ # This is used to import external_lib into the huggingface systems
+ import importlib
+ importlib.import_module(self.config.model.external_lib)
+
+ from omegaconf import OmegaConf
+ from verl.utils.torch_dtypes import PrecisionType
+ override_model_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create()))
+ torch_dtype = torch.bfloat16
+
+ megatron_config = OmegaConf.create({
+ 'sequence_parallel': self.config.actor.megatron.get('sequence_parallel', True),
+ 'param_dtype': PrecisionType.to_str(torch_dtype),
+ 'tensor_model_parallel_size': mpu.get_tensor_model_parallel_world_size(),
+ 'pipeline_model_parallel_rank': mpu.get_pipeline_model_parallel_rank(),
+ 'pipeline_model_parallel_size': mpu.get_pipeline_model_parallel_world_size(),
+ 'virtual_pipeline_model_parallel_rank': mpu.get_virtual_pipeline_model_parallel_rank(),
+ 'virtual_pipeline_model_parallel_size': mpu.get_virtual_pipeline_model_parallel_world_size()
+ })
+
+ megatron_config = init_model_parallel_config(megatron_config)
+
+ if self._is_actor or self._is_rollout:
+ # we need the model for actor and rollout
+ if self._is_actor:
+ optim_config = self.config.actor.optim
+ else:
+ optim_config = None
+ self.actor_module, self.hybrid_engine, self.actor_optimizer, \
+ self.actor_model_config, self.actor_optim_config = self._build_model_optimizer(
+ model_path=self.config.model.path,
+ megatron_config=megatron_config,
+ optim_config=optim_config,
+ override_model_config=override_model_config,
+ )
+
+ if self._is_actor:
+ self.actor = MegatronPPOActor(config=self.config.actor,
+ model_config=self.actor_model_config,
+ megatron_config=megatron_config,
+ actor_module=self.actor_module,
+ actor_optimizer=self.actor_optimizer,
+ actor_optimizer_config=self.actor_optim_config)
+
+ if self._is_rollout:
+ self.rollout, self.sharding_manager = self._build_rollout()
+
+ if self._is_ref:
+ self.ref_module, self.ref_model_config = self._build_model_optimizer(
+ model_path=self.config.model.path,
+ megatron_config=megatron_config,
+ optim_config=None,
+ override_model_config=override_model_config,
+ )
+ self.ref_policy = MegatronPPOActor(config=self.config.ref,
+ model_config=self.ref_model_config,
+ megatron_config=megatron_config,
+ actor_module=self.ref_module,
+ actor_optimizer=None,
+ actor_optimizer_config=None)
+
+ torch.cuda.empty_cache()
+
+ @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO)
+ def update_actor(self, data: DataProto):
+ assert self._is_actor
+
+ data.batch = data.batch.cuda()
+
+ log_gpu_memory_usage('Before update policy', logger=logger)
+
+ dataloader = self.actor.make_minibatch_iterator(data=data)
+ metrics = self.actor.update_policy(dataloader=dataloader)
+
+ log_gpu_memory_usage('After update policy', logger=logger)
+
+ # TODO: here, we should return all metrics
+ output = DataProto(meta_info={'metrics': metrics})
+ output = output.to('cpu')
+ torch.cuda.empty_cache()
+ return output
+
+ # @register(dispatch_mode=Dispatch.MEGATRON_PP_AS_DP_PROTO)
+ # def compute_log_prob(self, data: DataProto) -> DataProto:
+ # assert self._is_rollout
+ # output = self.actor.compute_log_prob(data=data)
+ # output = DataProto.from_dict(tensors={'old_log_probs': output})
+ # torch.cuda.empty_cache()
+ # return output
+
+ @register(dispatch_mode=Dispatch.MEGATRON_PP_AS_DP_PROTO)
+ def generate_sequences(self, prompts: DataProto):
+ assert self._is_rollout
+
+ prompts.batch = prompts.batch.cuda()
+ meta_info = {'eos_token_id': self.tokenizer.eos_token_id, 'pad_token_id': self.tokenizer.pad_token_id}
+ prompts.meta_info.update(meta_info)
+ with self.sharding_manager:
+ log_gpu_memory_usage('After entering sharding manager', logger=logger)
+
+ prompts = self.sharding_manager.preprocess_data(prompts)
+ output = self.rollout.generate_sequences(prompts=prompts)
+
+ log_gpu_memory_usage('After rollout generation', logger=logger)
+
+ output = self.sharding_manager.postprocess_data(output)
+
+ validate = prompts.meta_info.get('validate', False)
+ if self._is_actor and not validate:
+ # we should always recompute old_log_probs when using the HybridEngine
+ output.meta_info['micro_batch_size'] = self.config.rollout.log_prob_micro_batch_size
+ output.meta_info['temperature'] = self.config.rollout.temperature
+ old_log_probs = self.actor.compute_log_prob(data=output)
+ output.batch['old_log_probs'] = old_log_probs
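+ # Recomputing old_log_probs under the training engine (instead of reusing
+ # the rollout engine's values) keeps the PPO ratio
+ # exp(log_prob_new - old_log_prob) consistent: vLLM and Megatron can
+ # disagree at bf16 precision even with identical weights.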
+
+ output = output.to('cpu')
+ # clear kv cache
+ torch.cuda.empty_cache()
+ log_gpu_memory_usage('After recompute log prob', logger=logger)
+ return output
+
+ @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO)
+ def compute_ref_log_prob(self, data: DataProto):
+ data = data.to('cuda')
+
+ assert self._is_ref
+ if self._is_offload_param:
+ load_megatron_param_and_grad(self.ref_module, torch.cuda.current_device(), self._is_offload_grad)
+
+ micro_batch_size = self.config.rollout.log_prob_micro_batch_size
+ data.meta_info['micro_batch_size'] = micro_batch_size
+ data.meta_info['temperature'] = self.config.rollout.temperature
+ output = self.ref_policy.compute_log_prob(data=data)
+ output = DataProto.from_dict(tensors={'ref_log_prob': output})
+ output = output.to('cpu')
+ if self._is_offload_param:
+ offload_megatron_param_and_grad(self.ref_module, self._is_offload_grad)
+ torch.cuda.empty_cache()
+ return output
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def load_checkpoint(self, checkpoint_path):
+ pass
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def load_pretrained_model(self, checkpoint_path):
+ pass
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def save_checkpoint(self, checkpoint_path):
+ assert self._is_actor
+ pass
+
+
+class CriticWorker(MegatronWorker):
+
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+
+ # NOTE(sgm): We utilize colocate WorkerGroup by default.
+ # As a result, Workers for different models share the same process.
+ # Therefore, we only require one distributed initialization.
+ # To utilize different parallel strategies for different models:
+ # 1. users should disable WorkerDict; 2. assign a different ResourcePool to each model;
+ # 3. apply the following patch in ray==2.10: https://github.com/ray-project/ray/pull/44385
+ if not torch.distributed.is_initialized():
+ rank = int(os.environ['LOCAL_RANK'])
+ torch.distributed.init_process_group(backend="nccl")
+ torch.cuda.set_device(rank)
+
+ if self.config.megatron.sequence_parallel:
+ os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'
+ mpu.initialize_model_parallel(
+ tensor_model_parallel_size=self.config.megatron.tensor_model_parallel_size,
+ pipeline_model_parallel_size=self.config.megatron.pipeline_model_parallel_size,
+ virtual_pipeline_model_parallel_size=None,
+ pipeline_model_parallel_split_rank=None,
+ use_sharp=False,
+ context_parallel_size=1,
+ expert_model_parallel_size=1,
+ nccl_communicator_config_path=None,
+ )
+
+ set_random_seed(seed=self.config.megatron.seed)
+
+ # normalize config
+ self.config.ppo_mini_batch_size //= mpu.get_data_parallel_world_size()
+ self.config.ppo_micro_batch_size //= mpu.get_data_parallel_world_size()
+
+ # TODO(sgm): support critic model offload
+
+ def _build_critic_model_optimizer(self,
+ model_path,
+ megatron_config: ModelParallelConfig,
+ optim_config,
+ override_model_config,
+ enable_gradient_checkpointing=False):
+ from megatron.core.models.gpt.gpt_model import ModelType
+ from verl.utils.model import print_model_size, update_model_config
+ from verl.utils.megatron.optimizer import get_megatron_optimizer
+ from verl.utils.megatron_utils import get_model, init_megatron_optim_config, init_model_parallel_config
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+
+ # Step 1: initialize the tokenizer
+ local_path = copy_local_path_from_hdfs(model_path)
+ self.tokenizer = hf_tokenizer(local_path)
+
+ # Step 2: get the actor_model_config
+ critic_model_config = AutoConfig.from_pretrained(local_path)
+
+ override_config_kwargs = {
+ 'bos_token_id': self.tokenizer.bos_token_id,
+ 'eos_token_id': self.tokenizer.eos_token_id,
+ 'pad_token_id': self.tokenizer.pad_token_id,
+ }
+ override_config_kwargs.update(override_model_config)
+ update_model_config(critic_model_config, override_config_kwargs=override_config_kwargs)
+
+ if self.rank == 0:
+ print(f'Model config after override: {critic_model_config}')
+
+ def megatron_critic_model_provider(pre_process, post_process):
+ from verl.utils.model import get_parallel_model_from_config
+ # TODO: support vpp here
+ # vpp_rank = mpu.get_virtual_pipeline_model_parallel_rank() # this will be set inside get_model
+ # this_megatron_config = copy.deepcopy(megatron_config)
+ # this_megatron_config.virtual_pipeline_model_parallel_rank = vpp_rank
+ parallel_model = get_parallel_model_from_config(config=critic_model_config,
+ megatron_config=megatron_config,
+ pre_process=pre_process,
+ post_process=post_process,
+ value=True)
+ parallel_model.cuda()
+ return parallel_model
+
+ # Step 3: initialize the megatron model
+ critic_module = get_model(model_provider_func=megatron_critic_model_provider,
+ model_type=ModelType.encoder_or_decoder,
+ wrap_with_ddp=True)
+ # note that here critic_module will be a list to be compatible with the construction of interleaved pp (vpp).
+ # but here, we do not use pp (vpp) yet. For simplicity, we remove the list
+ # critic_module = nn.ModuleList(critic_module)
+
+ if self.config.load_weight:
+ load_megatron_model_weights(self.config,
+ critic_model_config,
+ critic_module,
+ params_dtype=megatron_config.params_dtype,
+ is_value_model=True)
+ if self.rank == 0:
+ print_model_size(critic_module[0])
+
+ # TODO: add more optimizer args into config
+ optim_config = init_megatron_optim_config(optim_config)
+ critic_optimizer = get_megatron_optimizer(model=critic_module, config=optim_config)
+ torch.cuda.empty_cache()
+ return critic_module, critic_optimizer, critic_model_config, optim_config
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def init_model(self):
+ # create critic
+ from omegaconf import OmegaConf
+ from verl.utils.torch_dtypes import PrecisionType
+
+ if self.config.model.get('external_lib', None) is not None:
+ # This is used to import external_lib into the huggingface systems
+ import importlib
+ importlib.import_module(self.config.model.external_lib)
+ override_model_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create()))
+ torch_dtype = torch.bfloat16
+
+ megatron_config = OmegaConf.create({
+ 'sequence_parallel': self.config.megatron.get('sequence_parallel', True),
+ 'param_dtype': PrecisionType.to_str(torch_dtype),
+ 'tensor_model_parallel_size': mpu.get_tensor_model_parallel_world_size(),
+ 'pipeline_model_parallel_rank': mpu.get_pipeline_model_parallel_rank(),
+ 'pipeline_model_parallel_size': mpu.get_pipeline_model_parallel_world_size(),
+ 'virtual_pipeline_model_parallel_rank': mpu.get_virtual_pipeline_model_parallel_rank(),
+ 'virtual_pipeline_model_parallel_size': mpu.get_virtual_pipeline_model_parallel_world_size()
+ })
+
+ megatron_config = init_model_parallel_config(megatron_config)
+
+ critic_module, critic_optimizer, critic_model_config, critic_optimizer_config = self._build_critic_model_optimizer(
+ model_path=self.config.model.path,
+ megatron_config=megatron_config,
+ optim_config=self.config.optim,
+ override_model_config=override_model_config)
+ self.critic = MegatronPPOCritic(config=self.config,
+ model_config=critic_model_config,
+ megatron_config=megatron_config,
+ critic_module=critic_module,
+ critic_optimizer=critic_optimizer,
+ critic_optimizer_config=critic_optimizer_config)
+
+ @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO)
+ def compute_values(self, data: DataProto):
+ data = data.to('cuda')
+ values = self.critic.compute_values(data=data)
+ output = DataProto.from_dict(tensors={'values': values})
+ output = output.to('cpu')
+ return output
+
+ @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO)
+ def update_critic(self, data: DataProto):
+ data = data.to('cuda')
+ dataloader = self.critic.make_minibatch_iterator(data)
+ metrics = self.critic.update_critic(dataloader=dataloader)
+ output = DataProto(batch=None, meta_info={'metrics': metrics})
+ output = output.to('cpu')
+ return output
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def load_checkpoint(self, checkpoint_path):
+ pass
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def save_checkpoint(self, checkpoint_path):
+ pass
+
+
+class RewardModelWorker(MegatronWorker):
+ """
+ Note that we only implement reward models that are subclasses of AutoModelForSequenceClassification.
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+
+ # NOTE(sgm): We utilize colocate WorkerGroup by default.
+ # As a result, Workers for different models share the same process.
+ # Therefore, we only require one distributed initialization.
+ # To utilize different parallel strategies for different models:
+ # 1. users should disable WorkerDict; 2. assign a different ResourcePool to each model;
+ # 3. apply the following patch in ray==2.10: https://github.com/ray-project/ray/pull/44385
+ if not torch.distributed.is_initialized():
+ rank = int(os.environ['LOCAL_RANK'])
+ torch.distributed.init_process_group(backend="nccl")
+ torch.cuda.set_device(rank)
+
+ if self.config.megatron.sequence_parallel:
+ os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'
+ mpu.initialize_model_parallel(
+ tensor_model_parallel_size=self.config.megatron.tensor_model_parallel_size,
+ pipeline_model_parallel_size=self.config.megatron.pipeline_model_parallel_size,
+ virtual_pipeline_model_parallel_size=None,
+ pipeline_model_parallel_split_rank=None,
+ use_sharp=False,
+ context_parallel_size=1,
+ expert_model_parallel_size=1,
+ nccl_communicator_config_path=None,
+ )
+
+ set_random_seed(seed=self.config.megatron.seed)
+
+ # normalize config
+ self.config.micro_batch_size //= mpu.get_data_parallel_world_size()
+
+ def _build_rm_model(self, model_path, megatron_config: ModelParallelConfig, override_model_config):
+ from megatron.core.models.gpt.gpt_model import ModelType
+ from verl.utils.model import print_model_size, update_model_config
+ from verl.utils.megatron_utils import get_model
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+
+ # Step 1: initialize the tokenizer
+ local_path = copy_local_path_from_hdfs(model_path)
+ self.tokenizer = hf_tokenizer(local_path)
+
+ # Step 2: get the actor_model_config
+ rm_model_config = AutoConfig.from_pretrained(local_path)
+
+ override_config_kwargs = {
+ 'bos_token_id': self.tokenizer.bos_token_id,
+ 'eos_token_id': self.tokenizer.eos_token_id,
+ 'pad_token_id': self.tokenizer.pad_token_id,
+ }
+ override_config_kwargs.update(override_model_config)
+ update_model_config(rm_model_config, override_config_kwargs=override_config_kwargs)
+
+ if self.rank == 0:
+ print(f'Model config after override: {rm_model_config}')
+
+ def megatron_rm_model_provider(pre_process, post_process):
+ from verl.utils.model import get_parallel_model_from_config
+ # vpp is not supported yet because it hangs for an unknown reason; needs debugging
+ vpp_rank = mpu.get_virtual_pipeline_model_parallel_rank() # this will be set inside get_model
+ # this_megatron_config = copy.deepcopy(megatron_config)
+ # this_megatron_config.virtual_pipeline_model_parallel_rank = vpp_rank
+ parallel_model = get_parallel_model_from_config(config=rm_model_config,
+ megatron_config=megatron_config,
+ pre_process=pre_process,
+ post_process=post_process,
+ value=True)
+ parallel_model.cuda()
+ return parallel_model
+
+ # Step 3: initialize the megatron model
+ reward_model = get_model(model_provider_func=megatron_rm_model_provider,
+ model_type=ModelType.encoder_or_decoder,
+ wrap_with_ddp=False)
+ # note that here reward_model will be a list to be compatible with the construction of interleaved pp (vpp).
+ # but here, we do not use pp (vpp) yet. For simplicity, we remove the list
+ # reward_model = nn.ModuleList(reward_model)
+
+ if self.config.load_weight:
+ load_megatron_model_weights(self.config,
+ rm_model_config,
+ reward_model,
+ params_dtype=megatron_config.params_dtype,
+ is_value_model=True)
+
+ # TODO: add more optimizer args into config
+ torch.cuda.empty_cache()
+ return reward_model, rm_model_config
+
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def init_model(self):
+ # create critic
+ from omegaconf import OmegaConf
+ from verl.utils.torch_dtypes import PrecisionType
+ from transformers import AutoTokenizer
+
+ if self.config.model.get('external_lib', None) is not None:
+ # This is used to import external_lib into the huggingface systems
+ import importlib
+ importlib.import_module(self.config.model.external_lib)
+ override_model_config = OmegaConf.to_container(self.config.model.get('override_config', OmegaConf.create()))
+
+ sft_tokenizer_local_path = copy_local_path_from_hdfs(self.config.model.input_tokenizer)
+ sft_tokenizer = hf_tokenizer(sft_tokenizer_local_path)
+ rm_tokenizer_path = self.config.model.get('rm_tokenizer', None)
+ rm_tokenizer = None
+ if rm_tokenizer_path is not None:
+ rm_tokenizer_local_path = copy_local_path_from_hdfs(rm_tokenizer_path)
+ rm_tokenizer = hf_tokenizer(rm_tokenizer_local_path)
+
+ torch_dtype = torch.bfloat16
+
+ megatron_config = OmegaConf.create({
+ 'sequence_parallel': self.config.megatron.get('sequence_parallel', True),
+ 'param_dtype': PrecisionType.to_str(torch_dtype),
+ 'tensor_model_parallel_size': mpu.get_tensor_model_parallel_world_size(),
+ 'pipeline_model_parallel_rank': mpu.get_pipeline_model_parallel_rank(),
+ 'pipeline_model_parallel_size': mpu.get_pipeline_model_parallel_world_size(),
+ 'virtual_pipeline_model_parallel_rank': mpu.get_virtual_pipeline_model_parallel_rank(),
+ 'virtual_pipeline_model_parallel_size': mpu.get_virtual_pipeline_model_parallel_world_size()
+ })
+
+ megatron_config = init_model_parallel_config(megatron_config)
+
+ reward_model_module, reward_model_config = self._build_rm_model(
+ model_path=self.config.model.path,
+ megatron_config=megatron_config,
+ override_model_config=override_model_config,
+ )
+ # FIXME(sgm): reward model param offload is implemented inside MegatronRewardModel;
+ # it should be implemented in the workers instead
+ self.rm = MegatronRewardModel(config=self.config,
+ reward_model_module=reward_model_module,
+ model_config=reward_model_config,
+ megatron_config=megatron_config,
+ sft_tokenizer=sft_tokenizer,
+ rm_tokenizer=rm_tokenizer)
+
+ # TODO: the reward model should use its own tokenizer instead of the sft tokenizer;
+ # the input_ids, responses, attention_mask and position_ids may be different!
+ @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO)
+ def compute_rm_score(self, data: DataProto):
+ data.batch = data.batch.cuda()
+ output = self.rm.compute_reward(data)
+ output = output.to('cpu')
+ return output
diff --git a/code/RL_model/verl/Search-R1/verl/workers/reward_model/__init__.py b/code/RL_model/verl/Search-R1/verl/workers/reward_model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b48a750841888b1e220b72422659d8073c22a0
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/reward_model/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base import BasePPORewardModel
diff --git a/code/RL_model/verl/Search-R1/verl/workers/reward_model/base.py b/code/RL_model/verl/Search-R1/verl/workers/reward_model/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..c02487db3846d0fcec76c1c216fbbb52d15c64bd
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/reward_model/base.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The base class for reward model
+"""
+
+from abc import ABC, abstractmethod
+
+from verl import DataProto
+
+
+class BasePPORewardModel(ABC):
+
+ def __init__(self, config):
+ self.config = config
+
+ @abstractmethod
+ def compute_reward(self, data: DataProto) -> DataProto:
+ """Computing reward given input_ids. The transformers should output a tensor with shape
+ [batch_size, sequence_length], and the value at [EOS] mask should be gathered.
+
+ Args:
+ data: must contain keys "input_ids", "attention_mask" and "position_ids".
+ - input_ids: [batch_size, sequence_length]
+ - attention_mask: [batch_size, sequence_length]
+ - position_ids: [batch_size, sequence_length]
+
+ Returns: a DataProto containing "reward". Only the [EOS] position contains the reward;
+ other positions should have zero reward. Note that this may change in the future if we use
+ dense rewards, so we leave the interface general.
+ - reward: [batch_size, sequence_length].
+
+ """
+ pass
diff --git a/code/RL_model/verl/Search-R1/verl/workers/reward_model/megatron/__init__.py b/code/RL_model/verl/Search-R1/verl/workers/reward_model/megatron/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0956b4cc53b81bf4c675c235968e1fc577a49f9
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/reward_model/megatron/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .reward_model import MegatronRewardModel
diff --git a/code/RL_model/verl/Search-R1/verl/workers/reward_model/megatron/reward_model.py b/code/RL_model/verl/Search-R1/verl/workers/reward_model/megatron/reward_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7b3bb4c128bc528ae3d68b8ba34c3cea31c6c0d
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/reward_model/megatron/reward_model.py
@@ -0,0 +1,278 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Megatron Reward Model.
+"""
+
+from functools import partial
+
+import torch
+import torch.distributed
+from tensordict import TensorDict
+
+from verl import DataProto
+from verl.utils.torch_functional import (logprobs_from_logits, get_eos_mask, pad_sequence_to_length,
+                                         broadcast_dict_tensor, split_dict_tensor_into_batches)
+from verl.utils.megatron.pipeline_parallel import (compute_transformers_input_shapes, make_batch_generator)
+from verl.utils.torch_dtypes import PrecisionType
+from verl.workers.reward_model.base import BasePPORewardModel
+from verl.utils.megatron import sequence_parallel as sp_utils
+from megatron.core import parallel_state as mpu
+from megatron.core.pipeline_parallel import get_forward_backward_func
+
+
+class MegatronRewardModel(BasePPORewardModel):
+
+ def __init__(self,
+ config,
+ model_config,
+ reward_model_module: torch.nn.ModuleList,
+ megatron_config,
+ sft_tokenizer=None,
+ rm_tokenizer=None):
+ self.config = config
+ self.reward_model_module = reward_model_module
+ self.megatron_config = megatron_config
+ self.model_config = model_config
+ self.device = 'cuda'
+ self.sft_tokenizer = sft_tokenizer
+ self.rm_tokenizer = rm_tokenizer
+ self.use_different_tokenizer = rm_tokenizer is not None
+
+ if self.config.param_offload:
+ self.offload_params_to_cpu()
+
+ def re_encode_by_rm_tokenizer(self, data: DataProto) -> DataProto:
+ assert self.use_different_tokenizer, 're-encode requires rm_tokenizer to not be None!'
+ # need to use rm tokenizer to re-generate input_ids, attention_mask and position_ids
+ # 1. remove pad for each sequence
+ # 2. decode by sft_tokenizer, remove sft system prompts
+ # 3. encode by rm_tokenizer with rm system prompts, get rm_input_ids
+ # 4. generate attention_mask and position_ids
+ input_ids = data.batch['input_ids'] # (bs, seq_len)
+ attention_mask = data.batch['attention_mask']
+ position_ids = data.batch['position_ids']
+ ori_values = {'input_ids': input_ids, 'attention_mask': attention_mask, 'position_ids': position_ids}
+ ori_bs, ori_seqlen = input_ids.size(0), input_ids.size(1)
+ input_ids_for_rm = []
+ attention_mask_for_rm = []
+ position_ids_for_rm = []
+ print_decode = True
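+ # reserve extra headroom: re-encoding with the rm chat template can make
+ # sequences longer than the sft encoding (the length check below truncates
+ # anything that still overflows)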
+ ori_seqlen = ori_seqlen + 128
+ for ids, mask in zip(input_ids, attention_mask):
+ # 1. remove pad for each sequence
+ non_zero_indices = torch.nonzero(mask).view(-1)
+ begin_pos, end_pos = non_zero_indices[0].item(), non_zero_indices[-1].item()
+ valid_id = ids[begin_pos:end_pos + 1]
+ # 2. decode by sft_tokenizer, remove sft system prompts
+ decode_result = self.sft_tokenizer.decode(valid_id)
+ # workaround
+ decode_with_rm_chat = decode_result.replace("<|user|>\n", "[INST] ").replace(
+ "\n<|assistant|>\n", " [/INST]").replace(" \n<|assistant|>\n", " [/INST]") + ""
+
+ if print_decode and torch.distributed.get_rank() == 0:
+ # only print first decode result
+                print(f'device {torch.cuda.current_device()}: sft decode result:\n{decode_result}\n\n'
+                      f'device {torch.cuda.current_device()}: sft decode result with rm chat template:\n'
+                      f'{decode_with_rm_chat}\n\n')
+ print_decode = False
+ # 3. encode by rm_tokenizer
+ rm_input_ids = self.rm_tokenizer(decode_with_rm_chat,
+ return_tensors='pt')['input_ids'][0].to(input_ids.device)
+ # 4. generate attention_mask and position_ids
+ rm_attention_mask = torch.ones_like(rm_input_ids, device=input_ids.device)
+ cur_seqlen = rm_input_ids.shape[-1]
+ # NOTE(gh): the later reward compute will process the shape (bs, seqlen_pad_128)
+ if cur_seqlen > ori_seqlen:
+                print(f'warning: rm encode seqlen {cur_seqlen} > sft encode seqlen {ori_seqlen}')
+ rm_input_ids = rm_input_ids[:ori_seqlen]
+ rm_attention_mask = rm_attention_mask[:ori_seqlen]
+ else:
+ # right padding
+ rm_input_ids = pad_sequence_to_length(rm_input_ids, ori_seqlen, self.rm_tokenizer.pad_token_id)
+ rm_attention_mask = pad_sequence_to_length(rm_attention_mask, ori_seqlen, 0)
+ rm_position_ids = torch.arange(0, ori_seqlen, device=input_ids.device)
+ input_ids_for_rm.append(torch.unsqueeze(rm_input_ids, dim=0))
+ attention_mask_for_rm.append(torch.unsqueeze(rm_attention_mask, dim=0))
+ position_ids_for_rm.append(torch.unsqueeze(rm_position_ids, dim=0))
+ input_ids_for_rm = torch.cat(input_ids_for_rm, dim=0)
+ attention_mask_for_rm = torch.cat(attention_mask_for_rm, dim=0)
+ position_ids_for_rm = torch.cat(position_ids_for_rm, dim=0)
+
+ # (bs, seqlen) will not change, but input_ids, attention_mask and position_ids will change
+ # NOTE(gh): need to replace into origin values after compute reward!
+ data.batch['input_ids'] = input_ids_for_rm
+ data.batch['attention_mask'] = attention_mask_for_rm
+ data.batch['position_ids'] = position_ids_for_rm
+
+ return data, ori_values
+
+ @torch.no_grad()
+ def compute_reward(self, data: DataProto) -> DataProto:
+ if self.config.param_offload:
+ self.load_params_to_cuda()
+
+ if self.use_different_tokenizer:
+ data, ori_values = self.re_encode_by_rm_tokenizer(data)
+
+ input_ids = data.batch['input_ids'] # (bs, seq_len')
+ attention_mask = data.batch['attention_mask']
+ position_ids = data.batch['position_ids']
+
+ responses = data.batch['responses']
+ batch_size = responses.size(0)
+ response_length = responses.size(1)
+
+ with torch.no_grad():
+ output = self.forward_batch(data)
+ if mpu.is_pipeline_last_stage(ignore_virtual=True):
+ logits = torch.cat([o['logits'] for o in output], dim=0)
+ else:
+ logits = torch.empty(
+ (input_ids.shape[0], input_ids.shape[1]),
+                    dtype=torch.bfloat16, # TODO(sgm): check why this is bfloat16
+ device=input_ids.device)
+ # broadcast across pp ranks
+ torch.distributed.broadcast(tensor=logits,
+ src=mpu.get_pipeline_model_parallel_last_rank(),
+ group=mpu.get_pipeline_model_parallel_group(),
+ async_op=False)
+
+ # (bs, seqlen', hidden_size) -> (bs, seqlen', 1) -> (bs, seqlen')
+ token_level_rewards = logits
+ # find the last token reward
+ ends = attention_mask.cumsum(dim=-1).argmax(dim=-1).view(-1, 1) # (bs, 1)
+ rewards = torch.gather(token_level_rewards, dim=1, index=ends) # (bs, 1)
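+        # e.g. a right-padded mask [1, 1, 1, 0, 0]: cumsum -> [1, 2, 3, 3, 3]; argmax returns the
+        # first maximal index (2), i.e. the last valid token, whose predicted reward is gathered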
+
+ if self.use_different_tokenizer:
+ data.batch.update(ori_values)
+ input_ids = ori_values['input_ids']
+ attention_mask = ori_values['attention_mask']
+ position_ids = ori_values['position_ids']
+
+ token_level_rewards = rewards.expand(attention_mask.shape[0], attention_mask.shape[1]) # (bs, ori_seqlen)
+
+ # assign last valid token reward to ori position
+ eos_mask_idx = torch.argmax(position_ids * attention_mask, dim=-1) # (bs,)
+ eos_mask = torch.zeros_like(attention_mask)
+ eos_mask[torch.arange(batch_size), eos_mask_idx] = 1.
+
+ token_level_rewards = token_level_rewards * eos_mask
+ token_level_rewards = token_level_rewards[:, -response_length:]
+
+ if self.config.param_offload:
+ self.offload_params_to_cpu()
+ else:
+ # add empty cache after each compute
+ torch.cuda.empty_cache()
+
+ batch = TensorDict({'rm_scores': token_level_rewards}, batch_size=input_ids.shape[0])
+
+ return DataProto(batch=batch)
+
+ def forward_batch(self, data: DataProto):
+ """
+ We assume:
+ - The model takes input: (input_ids, attention_mask, position_ids). No rmpad for the input
+ - The communication shape is (total_nnz_pad_to_sp // tp_size, 1, hidden_size) if sequence parallel is enabled
+ """
+ # broadcast from last pp rank to all other pp ranks
+ # TODO: actually, we just need to control the sampling order.
+ data.batch = data.batch.contiguous()
+ broadcast_dict_tensor(data.batch,
+ src=mpu.get_pipeline_model_parallel_last_rank(),
+ group=mpu.get_pipeline_model_parallel_group())
+
+ # split into micro-batches
+ if self.config is not None and 'ppo_micro_batch_size' in self.config:
+ infer_batch_size = self.config.ppo_micro_batch_size
+ else:
+ infer_batch_size = data.batch.batch_size[0]
+
+ data.batch['attention_mask'] = data.batch['attention_mask'].to(bool)
+ batches = split_dict_tensor_into_batches(data.batch, batch_size=infer_batch_size)
+ n_micro_batch = len(batches)
+ seq_len = batches[0]['input_ids'].shape[1]
+
+ # compute input shapes for pp stages
+ input_shapes = compute_transformers_input_shapes(
+ batches,
+ meta_info={
+ 'sequence_parallel': self.megatron_config.sequence_parallel,
+ 'hidden_size': self.model_config.hidden_size
+ })
+        # get the forward function matching the pipeline-parallel schedule
+ forward_backward_func = get_forward_backward_func()
+
+ def loss_func(output):
+ return 1., {'logits': output.logits}
+
+ def forward_step(batch_iter, model):
+ batch = next(batch_iter)
+ input_ids = batch['input_ids']
+ attention_mask = batch['attention_mask']
+ position_ids = batch['position_ids']
+ output = model(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids)
+ return output, loss_func
+
+        # wrap the micro-batches into per-model-chunk batch generators (one per vpp chunk)
+ batch_generator = make_batch_generator(batches, vpp_size=len(self.reward_model_module))
+
+ # TODO: we may use the new schedule instead
+ # for flash-attn: (seq_len, batch_size, hidden_size) = (mbs*seq_len, 1, hidden_size)
+ if mpu.get_pipeline_model_parallel_world_size() > 1:
+ losses_reduced = forward_backward_func(
+ forward_step_func=forward_step,
+ data_iterator=batch_generator,
+ model=self.reward_model_module,
+ num_microbatches=n_micro_batch,
+ input_shapes=input_shapes, # must set for flash-attn sequence packing
+                seq_length=infer_batch_size * seq_len, # not used when input_shapes is set
+                hidden_size=self.model_config.hidden_size, # not used when input_shapes is set
+                micro_batch_size=1, # not used when input_shapes is set
+ forward_only=True,
+ )
+ else:
+ losses_reduced = forward_backward_func(
+ forward_step_func=forward_step,
+ data_iterator=batch_generator,
+ model=self.reward_model_module,
+ num_microbatches=n_micro_batch,
+                seq_length=infer_batch_size * seq_len, # used when pp size == 1
+                hidden_size=self.model_config.hidden_size, # used when pp size == 1
+                micro_batch_size=1, # used when pp size == 1
+ forward_only=True,
+ )
+        # losses_reduced contains the stats returned from loss_func
+
+ return losses_reduced
+
+ def offload_params_to_cpu(self):
+ if self.device == 'cuda':
+ for reward_model_module in self.reward_model_module:
+ for name, param in reward_model_module.named_parameters():
+ param.data = param.data.to('cpu', non_blocking=True)
+ self.device = 'cpu'
+ torch.cuda.empty_cache()
+
+ def load_params_to_cuda(self):
+ if self.device == 'cpu':
+ for reward_model_module in self.reward_model_module:
+ for name, param in reward_model_module.named_parameters():
+ param.data = param.data.to(torch.cuda.current_device(), non_blocking=True)
+ self.device = 'cuda'
diff --git a/code/RL_model/verl/Search-R1/verl/workers/rollout/__init__.py b/code/RL_model/verl/Search-R1/verl/workers/rollout/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..083848c77faafa61d2a449e23707431925fafb40
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/rollout/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base import BaseRollout
+from .naive import NaiveRollout
+from .hf_rollout import HFRollout
+
+__all__ = ["BaseRollout", "NaiveRollout", "HFRollout"]
diff --git a/code/RL_model/verl/Search-R1/verl/workers/rollout/base.py b/code/RL_model/verl/Search-R1/verl/workers/rollout/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c2733325bbf7ba4e8c3438a53c4e2b97d60ee83
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/rollout/base.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from typing import Iterable, Union
+
+from verl import DataProto
+
+__all__ = ['BaseRollout']
+
+
+class BaseRollout(ABC):
+
+ def __init__(self):
+ """
+
+ Args:
+ dataloader: an Iterable of TensorDict that consistently generates prompts. Note that the dataloader
+ should handle when the training stops.
+ """
+ super().__init__()
+
+ @abstractmethod
+ def generate_sequences(self, prompts: DataProto) -> DataProto:
+ """Generate sequences"""
+ pass
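+
+
+# A minimal concrete rollout might look like the sketch below (`EchoRollout` is hypothetical,
+# shown only to illustrate the interface; real implementations live in naive/, hf_rollout.py, etc.):
+#
+#     class EchoRollout(BaseRollout):
+#         def generate_sequences(self, prompts: DataProto) -> DataProto:
+#             # trivially echo the prompts back as "responses"
+#             prompts.batch['responses'] = prompts.batch['input_ids'].clone()
+#             return prompts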
diff --git a/code/RL_model/verl/Search-R1/verl/workers/rollout/hf_rollout.py b/code/RL_model/verl/Search-R1/verl/workers/rollout/hf_rollout.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d929e5dd439a5c1a3b92b73bd6cb134cbb29f09
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/rollout/hf_rollout.py
@@ -0,0 +1,140 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Rollout with huggingface models.
+TODO: refactor this class. Currently, it will hang when using FSDP HybridShard. We should actually create a single GPU model.
+Then, get full state_dict and bind the state_dict to the single GPU model. Then, use the single GPU model to perform generation.
+"""
+import contextlib
+import torch
+import torch.distributed
+from tensordict import TensorDict
+from torch import nn
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+
+from verl import DataProto
+from verl.utils.torch_functional import get_eos_mask
+from .base import BaseRollout
+
+from transformers import GenerationConfig
+
+__all__ = ['HFRollout']
+
+
+class HFRollout(BaseRollout):
+
+ def __init__(self, module: nn.Module, config):
+ super().__init__()
+ self.config = config
+ self.module = module
+
+ def generate_sequences(self, prompts: DataProto) -> DataProto:
+ batch_size = prompts.batch.batch_size[0]
+ num_chunks = max(batch_size // self.config.get('micro_batch_size', batch_size), 1)
+ batch_prompts = prompts.chunk(chunks=num_chunks)
+ output = [self._generate_minibatch(p) for p in batch_prompts]
+ output = DataProto.concat(output)
+ return output
+
+ @torch.no_grad()
+ def _generate_minibatch(self, prompts: DataProto) -> DataProto:
+ idx = prompts.batch['input_ids'] # (bs, prompt_length)
+ attention_mask = prompts.batch['attention_mask'] # left-padded attention_mask
+ position_ids = prompts.batch['position_ids']
+
+ # used to construct attention_mask
+ eos_token_id = prompts.meta_info['eos_token_id']
+ pad_token_id = prompts.meta_info['pad_token_id']
+
+ batch_size = idx.size(0)
+ prompt_length = idx.size(1)
+
+ self.module.eval()
+ param_ctx = contextlib.nullcontext()
+
+        # allow sampling args to be overridden by the prompts' meta_info
+ do_sample = prompts.meta_info.get('do_sample', self.config.do_sample)
+ response_length = prompts.meta_info.get('response_length', self.config.response_length)
+ top_p = prompts.meta_info.get('top_p', self.config.get('top_p', 1.0))
+ top_k = prompts.meta_info.get('top_k', self.config.get('top_k', 0))
+
+ if top_k is None:
+ top_k = 0
+ top_k = max(0, top_k) # to be compatible with vllm
+
+ temperature = prompts.meta_info.get('temperature', self.config.temperature)
+
+ generation_config = GenerationConfig(temperature=temperature, top_p=top_p, top_k=top_k)
+
+ if isinstance(self.module, FSDP):
+ # recurse need to set to False according to https://github.com/pytorch/pytorch/issues/100069
+ param_ctx = FSDP.summon_full_params(self.module, writeback=False, recurse=False)
+ with param_ctx:
+ with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
+ output = self.module.generate(
+ input_ids=idx,
+ attention_mask=attention_mask,
+ do_sample=do_sample,
+ max_new_tokens=response_length,
+ # max_length=max_length,
+ eos_token_id=eos_token_id,
+ pad_token_id=pad_token_id,
+ generation_config=generation_config,
+ # renormalize_logits=True,
+ output_scores=False, # this is potentially very large
+ return_dict_in_generate=True,
+ use_cache=True)
+ # TODO: filter out the seq with no answers like ds-chat
+ seq = output.sequences
+
+        # huggingface generate stops once every sequence in the batch has reached [EOS],
+        # so we have to right-pad the output back to response_length
+ sequence_length = prompt_length + self.config.response_length
+ delta_length = sequence_length - seq.shape[1]
+
+ if delta_length > 0:
+ delta_tokens = torch.ones(size=(batch_size, delta_length), device=seq.device, dtype=seq.dtype)
+ delta_tokens = pad_token_id * delta_tokens
+ seq = torch.cat((seq, delta_tokens), dim=1)
+
+ assert seq.shape[1] == sequence_length
+
+ prompt = seq[:, :prompt_length] # (bs, prompt_length)
+ response = seq[:, prompt_length:] # (bs, response_length)
+
+ response_length = response.size(1)
+ delta_position_id = torch.arange(1, response_length + 1, device=position_ids.device)
+ delta_position_id = delta_position_id.unsqueeze(0).repeat(batch_size, 1)
+
+ response_position_ids = position_ids[:, -1:] + delta_position_id
+ position_ids = torch.cat([position_ids, response_position_ids], dim=-1)
+
+ response_attention_mask = get_eos_mask(response_id=response, eos_token=eos_token_id, dtype=attention_mask.dtype)
+ attention_mask = torch.cat((attention_mask, response_attention_mask), dim=-1)
+
+ batch = TensorDict(
+ {
+ 'prompts': prompt,
+ 'responses': response,
+ 'input_ids': seq,
+ 'attention_mask': attention_mask,
+ 'position_ids': position_ids
+ },
+ batch_size=batch_size)
+
+ # empty cache before compute old_log_prob
+ torch.cuda.empty_cache()
+
+ self.module.train()
+ return DataProto(batch=batch)
diff --git a/code/RL_model/verl/Search-R1/verl/workers/rollout/naive/__init__.py b/code/RL_model/verl/Search-R1/verl/workers/rollout/naive/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..df81c8603fc41731b2ec2cf007a06f5976e43c06
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/rollout/naive/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .naive_rollout import NaiveRollout
diff --git a/code/RL_model/verl/Search-R1/verl/workers/rollout/naive/naive_rollout.py b/code/RL_model/verl/Search-R1/verl/workers/rollout/naive/naive_rollout.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f2e8d59b9c664912f9ce81e5410f667985f0726
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/rollout/naive/naive_rollout.py
@@ -0,0 +1,119 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+In the single-GPU rollout, the sequences are generated directly by sampling from the model.
+The output will contain
+1. output_ids
+2. attention_masks (left padding)
+3. eos_masks
+4. log_probs
+"""
+from typing import Iterable, Union
+
+import torch
+import torch.nn.functional as F
+from tensordict import TensorDict
+from torch import nn
+
+from verl import DataProto
+from verl.utils.torch_functional import logprobs_from_logits
+from ..base import BaseRollout
+
+__all__ = ['NaiveRollout']
+
+
+class NaiveRollout(BaseRollout):
+
+ def __init__(self, module: nn.Module, config):
+ """A naive rollout. It requires the module to be compatible with huggingface APIs. That is:
+ The module should define __call__ to receive input_ids, attention_mask and position_ids.
+ It outputs a structure that contains logits field.
+
+ Args:
+ module: module here follows huggingface APIs
+ config: DictConfig
+ """
+ super().__init__()
+ self.config = config
+ self.module = module
+
+ @torch.no_grad()
+ def generate_sequences(self, prompts: DataProto) -> DataProto:
+ """Generate sequences"""
+ idx = prompts.batch['input_ids'] # (bs, prompt_length)
+ attention_mask = prompts.batch['attention_mask'] # left-padded attention_mask
+ position_ids = prompts.batch['position_ids']
+
+ # used to construct attention_mask
+ eos_token_id = prompts.meta_info['eos_token_id']
+
+ batch_size = idx.size(0)
+ prompt_length = idx.size(1)
+
+ self.module.eval()
+
+ prev_attention_mask = torch.ones(size=(batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device)
+
+ logits_lst = []
+ for _ in range(self.config.response_length):
+ # if the sequence context is growing too long we must crop it at block_size
+ # idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
+ idx_cond = idx
+ # forward the model to get the logits for the index in the sequence
+ # we use huggingface APIs here
+ output = self.module(input_ids=idx_cond, attention_mask=attention_mask, position_ids=position_ids)
+ logits = output.logits
+ # pluck the logits at the final step and scale by desired temperature
+ logits = logits[:, -1, :] / self.config.temperature # (bs, vocab_size)
+ # optionally crop the logits to only the top k options
+ if self.config.top_k is not None:
+ v, _ = torch.topk(logits, min(self.config.top_k, logits.size(-1)))
+ logits[logits < v[:, [-1]]] = -float('Inf')
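+                # e.g. logits [2.0, 1.0, 0.5] with top_k = 2: v = [2.0, 1.0], so the threshold
+                # v[:, [-1]] is 1.0 and 0.5 is masked to -inf before the softmax below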
+ # apply softmax to convert logits to (normalized) probabilities
+ probs = F.softmax(logits, dim=-1)
+ # sample from the distribution
+ if self.config.do_sample:
+ idx_next = torch.multinomial(probs, num_samples=1)
+ else:
+ idx_next = torch.argmax(probs, dim=-1, keepdim=True)
+
+ attention_mask = torch.cat((attention_mask, prev_attention_mask), dim=-1)
+
+ prev_attention_mask = torch.logical_and(idx_next != eos_token_id, prev_attention_mask.bool())
+            prev_attention_mask = prev_attention_mask.to(attention_mask.dtype)
+
+ position_ids = torch.cat((position_ids, position_ids[:, -1:] + 1), dim=-1)
+
+ # append sampled index to the running sequence and continue
+ idx = torch.cat((idx, idx_next), dim=1)
+ logits_lst.append(logits)
+
+ logits = torch.stack(logits_lst, dim=1) # (bs, response_length, vocab_size)
+        prompt = idx[:, :prompt_length] # (bs, prompt_length)
+ response = idx[:, prompt_length:] # (bs, response_length)
+ log_probs = logprobs_from_logits(logits=logits, labels=response)
+ batch = TensorDict(
+ {
+                'input_ids': prompt,
+ 'responses': response,
+ 'sequences': idx,
+ 'old_log_probs': log_probs,
+ 'attention_mask': attention_mask,
+ 'position_ids': position_ids,
+ },
+ batch_size=batch_size)
+
+ self.module.train()
+
+ return DataProto(batch=batch)
diff --git a/code/RL_model/verl/Search-R1/verl/workers/rollout/tokenizer.py b/code/RL_model/verl/Search-R1/verl/workers/rollout/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0dfa3a530329605d7af48a2186d304198774e09
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/rollout/tokenizer.py
@@ -0,0 +1,162 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The base tokenizer class, required for any hybrid engine based rollout or inference with vLLM.
+"""
+from abc import ABC, abstractmethod
+from typing import Dict, List, Union
+
+__all__ = ['HybridEngineBaseTokenizer']
+
+
+class HybridEngineBaseTokenizer(ABC):
+ """the tokenizer property and function name should align with HF's to meet vllm requirement"""
+
+ @property
+ @abstractmethod
+ def vocab_size(self):
+ """
+ `int`: Size of the base vocabulary (without the added tokens).
+ """
+ pass
+
+ @property
+ @abstractmethod
+ def pad_token_id(self):
+ """
+ `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been set.
+ """
+ pass
+
+ @property
+ @abstractmethod
+ def eos_token_id(self):
+ """
+ `Optional[int]`: Id of the end of sentence token in the vocabulary. Returns `None` if the token has not been
+ set.
+ """
+ pass
+
+ @property
+ @abstractmethod
+ def all_special_ids(self) -> List[int]:
+ """
+        `List[int]`: List the ids of the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
+ """
+ pass
+
+ @property
+ @abstractmethod
+ def all_special_tokens(self) -> List[str]:
+ """
+        `List[str]`: A list of the unique special tokens (`'<unk>'`, `'<cls>'`, ..., etc.).
+
+ Convert tokens of `tokenizers.AddedToken` type to string.
+ """
+ pass
+
+ @abstractmethod
+ def encode(self, text):
+ """
+ Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.
+
+ Args:
+            text (`str`, `List[str]` or `List[int]`):
+                The sequence to be encoded. This can be a string, a list of strings (tokenized string using the
+                `tokenize` method) or a list of integers.
+ """
+ pass
+
+ @abstractmethod
+ def decode(
+ self,
+ token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
+ skip_special_tokens: bool = False,
+ clean_up_tokenization_spaces: bool = None,
+ **kwargs,
+ ) -> str:
+ """
+        Converts a sequence of ids into a string, using the tokenizer and vocabulary with options to remove special
+ tokens and clean up tokenization spaces.
+
+ Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
+
+ Args:
+ token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
+ List of tokenized input ids. Can be obtained using the `__call__` method.
+ skip_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not to remove special tokens in the decoding.
+ clean_up_tokenization_spaces (`bool`, *optional*):
+ Whether or not to clean up the tokenization spaces. If `None`, will default to
+ `self.clean_up_tokenization_spaces`.
+ kwargs (additional keyword arguments, *optional*):
+ Will be passed to the underlying model specific decode method.
+
+ Returns:
+ `str`: The decoded sentence.
+ """
+ pass
+
+ @abstractmethod
+ def convert_ids_to_tokens(self,
+ ids: Union[int, List[int]],
+ skip_special_tokens: bool = False) -> Union[str, List[str]]:
+ """
+        Converts a single index or a sequence of indices into a token or a sequence of tokens, using the vocabulary and
+ added tokens.
+
+ Args:
+ ids (`int` or `List[int]`):
+ The token id (or token ids) to convert to tokens.
+ skip_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not to remove special tokens in the decoding.
+
+ Returns:
+ `str` or `List[str]`: The decoded token(s).
+ """
+ pass
+
+ @abstractmethod
+ def get_added_vocab(self) -> Dict[str, int]:
+ """
+ Returns the added tokens in the vocabulary as a dictionary of token to index. Results might be different from
+ the fast call because for now we always add the tokens even if they are already in the vocabulary. This is
+ something we should change.
+
+ Returns:
+ `Dict[str, int]`: The added tokens.
+ """
+ pass
+
+ @abstractmethod
+ def convert_tokens_to_string(self, tokens: List[str]) -> str:
+ """
+        Converts a sequence of tokens into a single string. The simplest way to do it is `" ".join(tokens)`, but we
+ often want to remove sub-word tokenization artifacts at the same time.
+
+ Args:
+            tokens (`List[str]`): The tokens to join into a string.
+
+ Returns:
+ `str`: The joined tokens.
+ """
+ pass
+
+ @property
+ def is_fast(self):
+ return False
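+
+
+# A minimal sketch of a concrete subclass that simply delegates to a HuggingFace tokenizer
+# (`hf_tok` is an assumed `transformers.PreTrainedTokenizer` instance; illustration only):
+#
+#     class HFWrappedTokenizer(HybridEngineBaseTokenizer):
+#         def __init__(self, hf_tok):
+#             self._tok = hf_tok
+#         vocab_size = property(lambda self: self._tok.vocab_size)
+#         pad_token_id = property(lambda self: self._tok.pad_token_id)
+#         eos_token_id = property(lambda self: self._tok.eos_token_id)
+#         all_special_ids = property(lambda self: self._tok.all_special_ids)
+#         all_special_tokens = property(lambda self: self._tok.all_special_tokens)
+#         def encode(self, text): return self._tok.encode(text)
+#         def decode(self, token_ids, **kwargs): return self._tok.decode(token_ids, **kwargs)
+#         def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
+#             return self._tok.convert_ids_to_tokens(ids, skip_special_tokens)
+#         def get_added_vocab(self): return self._tok.get_added_vocab()
+#         def convert_tokens_to_string(self, tokens): return self._tok.convert_tokens_to_string(tokens)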
diff --git a/code/RL_model/verl/Search-R1/verl/workers/rollout/vllm_rollout/__init__.py b/code/RL_model/verl/Search-R1/verl/workers/rollout/vllm_rollout/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f06d209f9d7d58c5aa41efad7cd237164a9fb8b
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/rollout/vllm_rollout/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .vllm_rollout import vLLMRollout
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/workers/rollout/vllm_rollout/vllm_rollout.py b/code/RL_model/verl/Search-R1/verl/workers/rollout/vllm_rollout/vllm_rollout.py
new file mode 100644
index 0000000000000000000000000000000000000000..947d558fb1910c09a61ec0c81087815d92d16f94
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/rollout/vllm_rollout/vllm_rollout.py
@@ -0,0 +1,226 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The vllm_rollout that can be applied in different backends.
+When working with FSDP:
+- Use DTensor weight loader (recommended) or HF weight loader
+- Utilize state_dict from the FSDP to synchronize the weights among tp ranks in vLLM
+When working with Megatron:
+- Use Megatron weight loader
+- During training, only the current pp stage holds the parameters
+- Before inference, broadcast the parameters of the current pp rank to all other pp ranks (all pp ranks hold all the parameters)
+- Bind the parameters to the inference engine
+- Do inference in tp. pp is treated as additional dp
+- After inference, all the parameters that don't belong to this pp rank are freed.
+"""
+from typing import List
+from contextlib import contextmanager
+from omegaconf import DictConfig
+import torch
+import torch.distributed
+from tensordict import TensorDict
+from torch import nn
+
+from verl import DataProto
+from verl.utils.torch_functional import get_eos_mask, pad_sequence_to_length
+from verl.workers.rollout.base import BaseRollout
+from verl.third_party.vllm import LLM, vllm_version
+from verl.third_party.vllm import parallel_state as vllm_ps
+from vllm import SamplingParams
+
+# TODO
+# 1. support pp in vllm
+# 2. passing tokenizer is not necessary? no encoding/decoding is happening here
+# 3. simplify init logics
+
+
+# NOTE(sgm): add for verl. We can optimize it by making the dataloader yield List[int] without padding.
+def _pre_process_inputs(pad_token_id, prompt_token_ids: torch.Tensor) -> List[int]:
+ # remove the left padding in the prompt token_id
+ # pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id is not None else self.llm_engine.tokenizer.eos_token_id
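+    # e.g. pad_token_id = 0 and prompt_token_ids = tensor([0, 0, 31, 42, 7]):
+    # the first non-pad position is 2, so we return [31, 42, 7]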
+ non_pad_index = torch.nonzero(prompt_token_ids != pad_token_id, as_tuple=False)[0][0]
+ token_ids = prompt_token_ids[non_pad_index:].tolist()
+ return token_ids
+
+
+class vLLMRollout(BaseRollout):
+
+ def __init__(self, actor_module: nn.Module, config: DictConfig, tokenizer, model_hf_config, **kwargs):
+ """A vLLM rollout. It requires the module is supported by the vllm.
+
+ Args:
+ module: module here follows huggingface APIs
+ config: DictConfig
+ tokenizer: the task/model tokenizer
+            model_hf_config: the huggingface config used to initialize the generating model in vllm
+ **kwargs: train_tp, for Megatron Backend to initialize hybrid engine (zero redundancy) process group
+ """
+ super().__init__()
+ self.config = config
+ assert not (not config.enforce_eager and config.free_cache_engine), \
+ "disable CUDA graph (enforce_eager = False) if free cache engine"
+
+ tensor_parallel_size = self.config.get('tensor_model_parallel_size', 1)
+ assert tensor_parallel_size <= torch.distributed.get_world_size(), \
+ "tensor parallel size should be less than or equal to the world size"
+
+ if kwargs.get('train_tp', None) is not None:
+ # deployed with megatron
+ import os
+ os.environ['CUDA_TIMER_STREAM_KAFKA_ENABLE'] = '0'
+ os.environ['MEGATRON_IMPORT_TIMERS'] = '0'
+ train_tp = kwargs.get('train_tp', None)
+ num_tp_per_train_tp = train_tp // tensor_parallel_size
+ if vllm_version in ('0.4.2', '0.5.4', '0.6.3'):
+ vllm_ps.initialize_parallel_state(tensor_model_parallel_size=tensor_parallel_size,
+ num_tp_per_train_tp=num_tp_per_train_tp)
+
+ assert model_hf_config.max_position_embeddings >= config.prompt_length + config.response_length, \
+ "model context length should be greater than total sequence length"
+ self.inference_engine = LLM(actor_module,
+ tokenizer=tokenizer,
+ model_hf_config=model_hf_config,
+ tensor_parallel_size=tensor_parallel_size,
+ dtype=config.dtype,
+ enforce_eager=config.enforce_eager,
+ gpu_memory_utilization=config.gpu_memory_utilization,
+ skip_tokenizer_init=False,
+ max_model_len=config.prompt_length + config.response_length,
+ load_format=config.load_format)
+
+ # Offload vllm model to reduce peak memory usage
+ self.inference_engine.offload_model_weights()
+
+ kwargs = dict(
+ n=1,
+ logprobs=1, # can be set to 0 and let actor to recompute
+ max_tokens=config.response_length,
+ )
+
+ # we may detokenize the result all together later
+ if vllm_version in ('0.4.2', '0.5.4', '0.6.3'):
+ kwargs['detokenize'] = False
+
+ # supporting adding any sampling params from the config file
+ for k in config.keys():
+ if hasattr(SamplingParams(), str(k)):
+ kwargs[k] = config.get(k)
+
+ print(f"kwargs: {kwargs}")
+ self.sampling_params = SamplingParams(**kwargs)
+
+ self.pad_token_id = tokenizer.pad_token_id
+
+ @contextmanager
+ def update_sampling_params(self, **kwargs):
+ # update sampling params
+ old_sampling_params_args = {}
+ if kwargs:
+ for key, value in kwargs.items():
+ if hasattr(self.sampling_params, key):
+ old_value = getattr(self.sampling_params, key)
+ old_sampling_params_args[key] = old_value
+ setattr(self.sampling_params, key, value)
+ yield
+ # roll back to previous sampling params
+ for key, value in old_sampling_params_args.items():
+ setattr(self.sampling_params, key, value)
+
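+    # Usage sketch (this is how generate_sequences below uses it): override params for a
+    # single call, and the originals are restored when the context exits, e.g.
+    #     with self.update_sampling_params(temperature=0.0, n=1):
+    #         output = self.inference_engine.generate(...)
+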
+ @torch.no_grad()
+ def generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto:
+ # rebuild vllm cache engine
+ if self.config.free_cache_engine:
+ self.inference_engine.init_cache_engine()
+
+ idx = prompts.batch['input_ids'] # (bs, prompt_length)
+ # left-padded attention_mask
+ attention_mask = prompts.batch['attention_mask']
+ position_ids = prompts.batch['position_ids']
+
+ # used to construct attention_mask
+ eos_token_id = prompts.meta_info['eos_token_id']
+
+ batch_size = idx.size(0)
+
+ idx_list = []
+        # parse idx from torch.Tensor to List[List[int]]
+ for i in range(batch_size):
+ idx_list.append(_pre_process_inputs(self.pad_token_id, idx[i]))
+
+ do_sample = prompts.meta_info.get('do_sample', True)
+ if not do_sample:
+ kwargs = {
+ 'best_of': 1,
+ 'top_p': 1.0,
+ 'top_k': -1,
+ 'min_p': 0.0,
+ 'temperature': 0,
+ 'n': 1 # if greedy, only 1 response
+ }
+
+ # users can customize different sampling_params at different run
+ with self.update_sampling_params(**kwargs):
+ output = self.inference_engine.generate(
+                prompts=None, # because we have already converted it to prompt token ids
+ sampling_params=self.sampling_params,
+ prompt_token_ids=idx_list,
+ use_tqdm=False)
+
+        # TODO(sgm): disable logprob when recompute_log_prob is enabled
+ # if n = 1: (bs, response_length) ; if n > 1: (bs * n, response_length)
+ response = output[0].to(idx.device)
+ log_probs = output[1].to(idx.device)
+
+ if response.shape[1] < self.config.response_length:
+ response = pad_sequence_to_length(response, self.config.response_length, self.pad_token_id)
+ log_probs = pad_sequence_to_length(log_probs, self.config.response_length, self.pad_token_id)
+
+ if self.config.n > 1 and do_sample:
+ idx = idx.repeat_interleave(self.config.n, dim=0)
+ attention_mask = attention_mask.repeat_interleave(self.config.n, dim=0)
+ position_ids = position_ids.repeat_interleave(self.config.n, dim=0)
+ batch_size = batch_size * self.config.n
+ seq = torch.cat([idx, response], dim=-1)
+
+ response_length = response.size(1)
+ delta_position_id = torch.arange(1, response_length + 1, device=position_ids.device)
+ delta_position_id = delta_position_id.unsqueeze(0).repeat(batch_size, 1)
+
+ # TODO(sgm): fix position_ids on right_pad
+ # prompt: left pad + response: right pad
+ # attention_mask: [0,0,0,0,1,1,1,1, | 1,1,1,0,0,0,0,0]
+ # position_ids: [0,0,0,0,0,1,2,3, | 4,5,6,7,8,9,10,11]
+ response_position_ids = position_ids[:, -1:] + delta_position_id
+ position_ids = torch.cat([position_ids, response_position_ids], dim=-1)
+ response_attention_mask = get_eos_mask(response_id=response, eos_token=eos_token_id, dtype=attention_mask.dtype)
+ attention_mask = torch.cat((attention_mask, response_attention_mask), dim=-1)
+
+        # all tp ranks contain the same data here; the data on every rank is valid
+ batch = TensorDict(
+ {
+ 'prompts': idx,
+ 'responses': response,
+ 'input_ids': seq, # here input_ids become the whole sentences
+ # 'old_log_probs': log_probs, # we will recompute old log prob with actor
+ 'attention_mask': attention_mask,
+ 'position_ids': position_ids
+ },
+ batch_size=batch_size)
+
+ # free vllm cache engine
+ if self.config.free_cache_engine:
+ self.inference_engine.free_cache_engine()
+
+ return DataProto(batch=batch)
diff --git a/code/RL_model/verl/Search-R1/verl/workers/sharding_manager/__init__.py b/code/RL_model/verl/Search-R1/verl/workers/sharding_manager/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e72fdf011c2455d920d0857eb3e6eadbaeebc332
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/sharding_manager/__init__.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from verl.utils.import_utils import is_vllm_available, is_megatron_core_available
+
+from .base import BaseShardingManager
+from .fsdp_ulysses import FSDPUlyssesShardingManager
+
+if is_megatron_core_available() and is_vllm_available():
+    from .megatron_vllm import AllGatherPPModel, MegatronVLLMShardingManager
+else:
+    AllGatherPPModel = None
+    MegatronVLLMShardingManager = None
+
+if is_vllm_available():
+ from .fsdp_vllm import FSDPVLLMShardingManager
+else:
+ FSDPVLLMShardingManager = None
diff --git a/code/RL_model/verl/Search-R1/verl/workers/sharding_manager/base.py b/code/RL_model/verl/Search-R1/verl/workers/sharding_manager/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8717890f2e2cf4d2c5e7683398e32fa8ebf3765
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/sharding_manager/base.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Sharding manager to implement HybridEngine
+"""
+
+from verl import DataProto
+
+
+class BaseShardingManager:
+
+ def __enter__(self):
+ pass
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ pass
+
+ def preprocess_data(self, data: DataProto) -> DataProto:
+ return data
+
+ def postprocess_data(self, data: DataProto) -> DataProto:
+ return data
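+
+
+# Expected usage pattern (a sketch; `manager` and `data` are placeholders): the manager is a
+# context manager that reshards weights on enter/exit, and data is resharded around inference:
+#
+#     with manager:
+#         data = manager.preprocess_data(data)
+#         # ... run generation / inference ...
+#         data = manager.postprocess_data(data)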
diff --git a/code/RL_model/verl/Search-R1/verl/workers/sharding_manager/fsdp_ulysses.py b/code/RL_model/verl/Search-R1/verl/workers/sharding_manager/fsdp_ulysses.py
new file mode 100644
index 0000000000000000000000000000000000000000..3969a6fc519c7b5f46ff57c29f57605d0d184e00
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/sharding_manager/fsdp_ulysses.py
@@ -0,0 +1,88 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Contains a sharding manager that supports data resharding across the sequence-parallel group when using FSDP + Ulysses
+"""
+from typing import Optional
+from .base import BaseShardingManager
+
+import random
+from torch.distributed.device_mesh import DeviceMesh
+
+from verl.utils.torch_functional import allgather_dict_tensors
+from verl.utils.ulysses import set_ulysses_sequence_parallel_group, get_ulysses_sequence_parallel_group
+import numpy as np
+
+import torch
+import torch.distributed
+
+from verl import DataProto
+
+
+class FSDPUlyssesShardingManager(BaseShardingManager):
+ """
+ Sharding manager to support data resharding when using FSDP + Ulysses
+ """
+
+ def __init__(self, device_mesh: DeviceMesh):
+ super().__init__()
+ self.device_mesh = device_mesh
+ self.seed_offset = 12345
+
+ def __enter__(self):
+ if self.device_mesh is not None:
+ # We have a global SP group
+ # so we have to change to use model-specific sp group
+ self.prev_sp_group = get_ulysses_sequence_parallel_group()
+ set_ulysses_sequence_parallel_group(self.device_mesh['sp'].get_group())
+ # TODO: check how to set seed for each model
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ # restore random states
+ if self.device_mesh is not None:
+ # revert to previous sp group
+ set_ulysses_sequence_parallel_group(self.prev_sp_group)
+ # TODO: check how to set seed for each model
+
+ def preprocess_data(self, data: DataProto) -> DataProto:
+ """
+ AllGather data from sp region
+ This is because the data is first sharded along the FSDP dimension as we utilize the DP_COMPUTE
+ In Ulysses, we need to make sure the same data is used across a SP group
+ """
+ if self.device_mesh is not None:
+ sp_size = self.device_mesh['sp'].size()
+ group = self.device_mesh['sp'].get_group()
+
+ prev_device = data.batch.device
+ data.batch = data.batch.cuda(device=torch.cuda.current_device())
+ data.batch = allgather_dict_tensors(data.batch.contiguous(), size=sp_size, group=group, dim=0)
+ data.batch = data.batch.to(prev_device)
+ # all gather non_tensor_batch
+ all_non_tensor_batch = [None for _ in range(sp_size)]
+ torch.distributed.all_gather_object(all_non_tensor_batch, data.non_tensor_batch, group=group)
+ data.non_tensor_batch = {
+ k: np.concatenate([d[k] for d in all_non_tensor_batch]) for k in data.non_tensor_batch
+ }
+ return data
+
+ def postprocess_data(self, data: DataProto) -> DataProto:
+ """
+ Split the data to follow FSDP partition
+ """
+ if self.device_mesh is not None:
+ sp_size = self.device_mesh['sp'].size()
+ sp_rank = self.device_mesh['sp'].get_local_rank()
+ data = data.chunk(chunks=sp_size)[sp_rank]
+ return data
\ No newline at end of file
diff --git a/code/RL_model/verl/Search-R1/verl/workers/sharding_manager/fsdp_vllm.py b/code/RL_model/verl/Search-R1/verl/workers/sharding_manager/fsdp_vllm.py
new file mode 100644
index 0000000000000000000000000000000000000000..19490f4ea50d50a6ca885bd07da4e3dc4f74e954
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/sharding_manager/fsdp_vllm.py
@@ -0,0 +1,133 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import logging
+import torch
+from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp.api import ShardingStrategy, ShardedStateDictConfig, StateDictType, FullStateDictConfig
+from torch.distributed.device_mesh import DeviceMesh
+
+from verl.third_party.vllm import LLM
+from verl.third_party.vllm import parallel_state as vllm_ps
+from verl import DataProto
+from verl.utils.torch_functional import (broadcast_dict_tensor, allgather_dict_tensors)
+from verl.utils.debug import log_gpu_memory_usage
+
+from .base import BaseShardingManager
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv('VERL_PPO_LOGGING_LEVEL', 'WARN'))
+
+
+class FSDPVLLMShardingManager(BaseShardingManager):
+
+ def __init__(self,
+ module: FSDP,
+ inference_engine: LLM,
+ model_config,
+ full_params: bool = False,
+ device_mesh: DeviceMesh = None):
+ self.module = module
+ self.inference_engine = inference_engine
+ self.model_config = model_config
+ self.device_mesh = device_mesh
+
+ # Full params
+ self.full_params = full_params
+ if full_params:
+ FSDP.set_state_dict_type(self.module,
+ state_dict_type=StateDictType.FULL_STATE_DICT,
+ state_dict_config=FullStateDictConfig())
+ else:
+ FSDP.set_state_dict_type(self.module,
+ state_dict_type=StateDictType.SHARDED_STATE_DICT,
+ state_dict_config=ShardedStateDictConfig())
+
+ # Note that torch_random_states may be different on each dp rank
+ self.torch_random_states = torch.cuda.get_rng_state()
+ # get a random rng states
+ if self.device_mesh is not None:
+ gen_dp_rank = self.device_mesh['dp'].get_local_rank()
+ torch.cuda.manual_seed(gen_dp_rank + 1000) # make sure all tp ranks have the same random states
+ self.gen_random_states = torch.cuda.get_rng_state()
+ torch.cuda.set_rng_state(self.torch_random_states)
+ else:
+ self.gen_random_states = None
+
+ def __enter__(self):
+        log_gpu_memory_usage('Before state_dict() in sharding manager', logger=logger)
+        params = self.module.state_dict()
+        log_gpu_memory_usage('After state_dict() in sharding manager', logger=logger)
+ # Copy, not share memory
+ load_format = 'hf' if self.full_params else 'dtensor'
+ self.inference_engine.sync_model_weights(params, load_format=load_format)
+ log_gpu_memory_usage('After sync model weights in sharding manager', logger=logger)
+
+ del params
+ torch.cuda.empty_cache()
+ log_gpu_memory_usage('After del state_dict and empty_cache in sharding manager', logger=logger)
+
+ # TODO: offload FSDP model weights
+ # self.module.cpu()
+ # torch.cuda.empty_cache()
+ # if torch.distributed.get_rank() == 0:
+ # print(f'after model to cpu in sharding manager memory allocated: {torch.cuda.memory_allocated() / 1e9}GB, reserved: {torch.cuda.memory_reserved() / 1e9}GB')
+
+ # important: need to manually set the random states of each tp to be identical.
+ if self.device_mesh is not None:
+ self.torch_random_states = torch.cuda.get_rng_state()
+ torch.cuda.set_rng_state(self.gen_random_states)
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ log_gpu_memory_usage('Before vllm offload in sharding manager', logger=logger)
+ self.inference_engine.offload_model_weights()
+ log_gpu_memory_usage('After vllm offload in sharding manager', logger=logger)
+
+ # self.module.to('cuda')
+ # if torch.distributed.get_rank() == 0:
+ # print(f'after actor module to cuda in sharding manager memory allocated: {torch.cuda.memory_allocated() / 1e9}GB, reserved: {torch.cuda.memory_reserved() / 1e9}GB')
+
+ self.module.train()
+
+ # add empty cache after each compute
+ torch.cuda.empty_cache()
+
+ # restore random states
+ if self.device_mesh is not None:
+ self.gen_random_states = torch.cuda.get_rng_state()
+ torch.cuda.set_rng_state(self.torch_random_states)
+
+ def preprocess_data(self, data: DataProto) -> DataProto:
+ # TODO: Current impl doesn't consider FSDP with torch micro-dp
+ data.batch = allgather_dict_tensors(data.batch.contiguous(),
+ size=vllm_ps.get_tensor_model_parallel_world_size(),
+ group=vllm_ps.get_tensor_model_parallel_group(),
+ dim=0)
+
+ return data
+
+ def postprocess_data(self, data: DataProto) -> DataProto:
+ # TODO: Current impl doesn't consider FSDP with torch micro-dp
+ broadcast_dict_tensor(data.batch,
+ src=vllm_ps.get_tensor_model_parallel_src_rank(),
+ group=vllm_ps.get_tensor_model_parallel_group())
+ dp_rank = torch.distributed.get_rank()
+ dp_size = torch.distributed.get_world_size() # not consider torch micro-dp
+ tp_size = vllm_ps.get_tensor_model_parallel_world_size()
+ if tp_size > 1:
+ # TODO: shall we build a micro_dp group for vllm when integrating with vLLM?
+ local_prompts = data.chunk(chunks=tp_size)
+ data = local_prompts[dp_rank % tp_size]
+ return data
diff --git a/code/RL_model/verl/Search-R1/verl/workers/sharding_manager/megatron_vllm.py b/code/RL_model/verl/Search-R1/verl/workers/sharding_manager/megatron_vllm.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc07a5a656445f4ea442440b8634422e1b836ce0
--- /dev/null
+++ b/code/RL_model/verl/Search-R1/verl/workers/sharding_manager/megatron_vllm.py
@@ -0,0 +1,428 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This file contains a Megatron style Hybrid Engine that shares the weights of the actor with the inference engine.
+"""
+
+import torch
+import torch.distributed as dist
+
+from torch import nn
+
+from megatron.core import parallel_state as mpu
+from megatron.core import DistributedDataParallel as LocalDDP
+from megatron.core.transformer.module import Float16Module
+from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
+from verl.utils.megatron_utils import get_model, unwrap_model
+from verl.utils.memory_buffer import (
+ build_memory_buffer,
+ build_memory_reference_from_module,
+ get_weight_buffer_meta_from_module,
+)
+
+
+class AllGatherPPModel:
+
+ def __init__(self, model_provider) -> None:
+
+ self._pp_group = mpu.get_pipeline_model_parallel_group()
+ self._pp_rank = mpu.get_pipeline_model_parallel_rank()
+ self._pp_size = mpu.get_pipeline_model_parallel_world_size()
+ self._vpp_size = mpu.get_virtual_pipeline_model_parallel_world_size()
+ self._model_chunk_size = self._vpp_size or 1
+
+ # each one holds a list of model_chunks in this pp stage
+ self._pp_models = [None] * self.pp_size
+
+ rank_list = list(range(self.pp_size))
+ # make current rank the last one to initialize
+ rank_list[self.pp_rank], rank_list[-1] = rank_list[-1], rank_list[self.pp_rank]
+ self._this_rank_models = None
+
+ # store the parameter of each pp stage
+ self.memory_buffers = [None] * self.pp_size
+ for cur_pp_rank in rank_list:
+            print(f'create pp model: torch allocated {torch.cuda.memory_allocated() / 1e9:.4f} GB, '
+                  f'reserved {torch.cuda.memory_reserved() / 1e9:.4f} GB')
+ # since the last initialized rank is the current pp rank, after init, the pp rank is still correct
+ mpu.set_pipeline_model_parallel_rank(cur_pp_rank)
+ if cur_pp_rank != self.pp_rank:
+ models = get_model(model_provider, wrap_with_ddp=False)
+ models = nn.ModuleList(models)
+ assert len(models) == self._model_chunk_size, f"{len(models)} != {self._model_chunk_size}"
+ self.pp_models[cur_pp_rank] = models
+ else:
+ # for regular model, we wrapped it with DDP
+ models = get_model(model_provider)
+ assert len(models) == self._model_chunk_size, f"{len(models)} != {self._model_chunk_size}"
+ self._this_rank_models = nn.ModuleList(models)
+ self.pp_models[cur_pp_rank] = nn.ModuleList(unwrap_model(models, (torchDDP, LocalDDP)))
+
+ self._build_param_buffer(cur_pp_rank)
+ self._build_param_references(cur_pp_rank, maintain_weight=cur_pp_rank == self.pp_rank)
+
+ # TODO: after binding to the memory buffer, we can load the checkpoint here
+ if cur_pp_rank != self.pp_rank:
+ for model in self.pp_models[cur_pp_rank]:
+ model.eval()
+ self._offload_params_to_cpu(cur_pp_rank)
+
+ def _build_param_buffer(self, pp_rank):
+ """Build the parameter buffer in each pp rank"""
+ model = self.pp_models[pp_rank]
+ weight_buffer_meta = get_weight_buffer_meta_from_module(model)
+ self.memory_buffers[pp_rank] = build_memory_buffer(weight_buffer_meta)
+
+ def _build_param_references(self, pp_rank, maintain_weight=False):
+ model = self.pp_models[pp_rank]
+ build_memory_reference_from_module(model, self.memory_buffers[pp_rank], maintain_weight=maintain_weight)
+
+ def _load_params_to_cuda(self, pp_rank, to_empty=False):
+ assert pp_rank != self.pp_rank, f"unexpected to load current pp rank [{pp_rank}] back to cuda"
+ for buffer in self.memory_buffers[pp_rank].values():
+ if not to_empty:
+ buffer.data = buffer.data.to(torch.cuda.current_device(), non_blocking=True)
+ else:
+ buffer.data = torch.empty_like(buffer.data, device='cuda')
+ # rebuild reference after loading to CUDA
+ self._build_param_references(pp_rank)
+
+ def _offload_params_to_cpu(self, pp_rank, to_empty=False):
+ assert pp_rank != self.pp_rank, f"unexpected to offload current pp rank [{pp_rank}] to cpu"
+ for buffer in self.memory_buffers[pp_rank].values():
+ if not to_empty:
+ # offload the whole memory buffer to CPU
+ buffer.data = buffer.data.to('cpu', non_blocking=True)
+ else:
+ buffer.data = torch.empty_like(buffer.data, device='cpu')
+ self._build_param_references(pp_rank)
+
+ def load_params_to_cuda(self, to_empty=False):
+ """load all model params to cuda"""
+ for cur_pp_rank in range(self.pp_size):
+ if cur_pp_rank != self.pp_rank:
+ self._load_params_to_cuda(cur_pp_rank, to_empty=to_empty)
+
+ def allgather_params(self):
+ """allgather params of all pp ranks. Return a list of handles"""
+ for cur_pp_rank in range(self.pp_size):
+ global_src = dist.get_global_rank(group=self.pp_group, group_rank=cur_pp_rank)
+
+ # NOTE(sgm): the async op may cause memory leakage of the memory_buffer/pp_models
+ for memory_buffer in self.memory_buffers[cur_pp_rank].values():
+ dist.broadcast(tensor=memory_buffer.data, src=global_src, group=self.pp_group, async_op=False)
+
+ def forward(self, *inputs, **kwargs):
+ try:
+ prev_output = None
+ for cur_chunk_rank in range(self._model_chunk_size):
+ if self._vpp_size:
+ mpu.set_virtual_pipeline_model_parallel_rank(cur_chunk_rank)
+
+ for cur_pp_rank in range(self.pp_size):
+ mpu.set_pipeline_model_parallel_rank(cur_pp_rank)
+ self.pp_models[cur_pp_rank][cur_chunk_rank].set_input_tensor(prev_output)
+ ret = self.pp_models[cur_pp_rank][cur_chunk_rank](*inputs, **kwargs)
+ self.pp_models[cur_pp_rank][cur_chunk_rank].set_input_tensor(None)
+ prev_output = ret
+ finally:
+ if self._vpp_size:
+ mpu.set_virtual_pipeline_model_parallel_rank(0)
+ mpu.set_pipeline_model_parallel_rank(self.pp_rank)
+ return ret
+
+ def __call__(self, *inputs, **kwargs):
+ return self.forward(*inputs, **kwargs)
+
+ def eval(self):
+ for model in self.pp_models[self.pp_rank]:
+ model.eval()
+
+ def train(self):
+ for model in self.pp_models[self.pp_rank]:
+ model.train()
+
+ def offload_params_to_cpu(self, to_empty=False):
+ """offload params of models that are not of current pp rank to cpu"""
+ for cur_pp_rank in range(self.pp_size):
+ if cur_pp_rank != self.pp_rank:
+ self._offload_params_to_cpu(cur_pp_rank, to_empty=to_empty)
+
+ def get_all_params(self):
+ """Get all the parameters of the models in all pp ranks
+
+ Returns:
+ params: List[List[Dict[str, Tensor]]]: a list of parameters in all pp, where each is a list of dict
+ tensors of each model chunk
+
+ """
+ params = []
+ for pp_rank in range(self.pp_size):
+ params.append([])
+ for model_chunk_idx in range(len(self.pp_models[pp_rank])):
+ params[pp_rank].append({})
+ pp_model = self.pp_models[pp_rank][model_chunk_idx]
+ pp_model = unwrap_model(pp_model, (torchDDP, LocalDDP, Float16Module)) # unwrap Float16Module as well so the raw params are exposed
+ for name, param in pp_model.named_parameters():
+ # NOTE(gh) workaround: should not get lora params for inference
+ if 'lora' in name:
+ continue
+ params[pp_rank][model_chunk_idx][name] = param
+
+ return params
+
+ def update_this_rank_models(self, new_models):
+ self._this_rank_models = new_models
+ self._pp_models[self.pp_rank] = unwrap_model(new_models, (torchDDP, LocalDDP))
+
+ @property
+ def this_rank_models(self):
+ return self._this_rank_models
+
+ @property
+ def pp_size(self):
+ return self._pp_size
+
+ @property
+ def pp_rank(self):
+ return self._pp_rank
+
+ @property
+ def pp_group(self):
+ return self._pp_group
+
+ @property
+ def pp_models(self):
+ return self._pp_models
+
+
+"""
+Megatron Hybrid Engine:
+- During training, only the current pp stage holds the parameters
+- Before inference, broadcast the parameters of the current pp rank to all other pp ranks (all pp ranks holds all the parameters)
+- Bind the parameters to the inference engine
+- Do inference in tp. pp is treated as additional dp
+- After inference, all the parameters that doesn't belong to this pp rank is freed.
+"""
+
+from .base import BaseShardingManager
+
+import torch
+from torch import nn
+import torch.distributed
+from torch.distributed import new_group
+
+from verl import DataProto
+from verl.utils.torch_functional import (broadcast_dict_tensor, allgather_dict_tensors)
+import verl.utils.megatron.tensor_parallel as tp_utils
+from verl.third_party.vllm import parallel_state as vllm_ps
+from verl.third_party.vllm import LLM
+from verl.utils.model import normalize_pp_vpp_params
+# Micro data parallel group: an additional dp group that originates from splitting the training tp
+# group into infer_tp and micro_dp. By default, we use the order micro_dp - tp
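+# e.g. with train_tp=4 and infer_tp=2, micro_dp_size=2, so a world of 8 ranks forms
+# 4 micro_dp groups: [0,1], [2,3], [4,5], [6,7] (matching the construction in __init__ below)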
+_MICRO_DATA_PARALLEL_GROUP = None
+
+
+class MegatronVLLMShardingManager(BaseShardingManager):
+
+ def __init__(self, module: AllGatherPPModel, inference_engine: LLM, model_config, layer_name_mapping):
+ self.module = module
+ self.inference_engine = inference_engine
+ self.model_config = model_config
+ self.layer_name_mapping = layer_name_mapping
+
+ # initialize micro_dp group for vllm inference
+ global _MICRO_DATA_PARALLEL_GROUP
+ world_size = torch.distributed.get_world_size()
+ rank = torch.distributed.get_rank()
+ train_tensor_parallel_size = mpu.get_tensor_model_parallel_world_size()
+ infer_tensor_parallel_size = vllm_ps.get_tensor_model_parallel_world_size()
+
+ # TODO(sgm): this may not be true for FSDP -> vLLM
+ assert infer_tensor_parallel_size <= train_tensor_parallel_size, \
+ 'Not implemented for infer_tp > train_tp'
+ assert train_tensor_parallel_size % infer_tensor_parallel_size == 0
+
+ micro_dp_size = train_tensor_parallel_size // infer_tensor_parallel_size
+ num_micro_dp_groups = world_size // micro_dp_size
+ assert _MICRO_DATA_PARALLEL_GROUP is None, ("micro data parallel group is already initialized")
+ for i in range(num_micro_dp_groups):
+ ranks = range(i * micro_dp_size, (i + 1) * micro_dp_size)
+ group = new_group(ranks=ranks)
+ if rank in ranks:
+ _MICRO_DATA_PARALLEL_GROUP = group
+
+ def default_tp_concat_fn(self, name, param, infer_params, model_config):
+ """
+ name: name of the parameter
+ param: training parameters
+ infer_params (List[torch.Tensor]): a list of parameters all-gathered from micro_dp_group
+ model_config: huggingface model_config
+ TODO(zhangchi.usc1992): currently, the implementation is ad hoc. We can move this function to the model
+ definition so that it is model-agnostic. If the model doesn't implement this function,
+ we can throw an error to force the user to disable the TP HybridEngine.
+ """
+
+ if self.layer_name_mapping.get("qkv_layer_name") in name:
+ # if the tensor is qkv, split each tp shard into q, k, v,
+ # then concat the q, k and v pieces separately across shards
+ q_lst = []
+ k_lst = []
+ v_lst = []
+ assert model_config.num_attention_heads % model_config.num_key_value_heads == 0
+ num_q_per_kv = model_config.num_attention_heads // model_config.num_key_value_heads
+ assert infer_params[0].shape[0] % (num_q_per_kv + 2) == 0
+ kv_size_per_tp = infer_params[0].shape[0] // (num_q_per_kv + 2)
+ split_size = [kv_size_per_tp * num_q_per_kv, kv_size_per_tp, kv_size_per_tp]
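+ # e.g. num_attention_heads=32, num_key_value_heads=8 -> num_q_per_kv=4; a fused qkv
+ # shard with 3072 rows then has kv_size_per_tp=512 and split_size=[2048, 512, 512]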
+ for infer_param in infer_params:
+ q, k, v = infer_param.split(split_size)
+ q_lst.append(q)
+ k_lst.append(k)
+ v_lst.append(v)
+ q = torch.cat(q_lst, dim=0)
+ k = torch.cat(k_lst, dim=0)
+ v = torch.cat(v_lst, dim=0)
+
+ infer_params = torch.cat((q, k, v), dim=0)
+
+ elif self.layer_name_mapping.get("gate_proj_layer_name") in name:
+ # if the tensor is the fused gate/up projection, split each shard into gate and up, then concat separately
+ gate_lst = []
+ up_lst = []
+ for infer_param in infer_params:
+ gate, up = infer_param.chunk(2)
+ gate_lst.append(gate)
+ up_lst.append(up)
+ gate = torch.cat(gate_lst, dim=0)
+ up = torch.cat(up_lst, dim=0)
+ infer_params = torch.cat((gate, up), dim=0)
+
+ else:
+ # concat tensor
+ infer_params = torch.cat(infer_params, dim=tp_utils.get_tensor_parallel_partition_dim(param))
+
+ return infer_params
+
+ def _post_process_params(self, params):
+ """
+ For each param, if it is a tp-split param, we all-gather it from the micro_dp group.
+ """
+ # here the params are in train tp format. we iterate params and all-gather
+ # TODO(zhangchi.usc1992) We can consider copying non-tp weights to another infer buffer.
+ # In this way, all the params in the original memory_buffers can be offloaded.
+ micro_dp_size = get_micro_data_parallel_world_size()
+ micro_dp_group = get_micro_data_parallel_group()
+
+ if micro_dp_size <= 1:
+ return
+
+ origin_params = {}
+ for name in params.keys():
+ param = params[name]
+ if tp_utils.is_tensor_parallel_param(param):
+ # allocate a new tensor with proper size
+ infer_params = [torch.empty_like(param) for _ in range(micro_dp_size)]
+ torch.distributed.all_gather(infer_params, param, group=micro_dp_group)
+ infer_params = self.default_tp_concat_fn(name, param, infer_params, self.model_config)
+ # swap the gathered tensor in and keep the original param so it can be restored on exit
+ params[name] = infer_params
+ origin_params[name] = param
+
+ return origin_params
+
+ def __enter__(self):
+ # create a new cuda space for parameters not in this pp rank
+ self.module.load_params_to_cuda()
+ # broadcast the parameters of each pp rank to all other pp ranks
+ self.module.allgather_params()
+ # obtain name to parameters in pp/vpp
+ params = self.module.get_all_params()
+
+ # bind the params to inference engine
+ self.params = normalize_pp_vpp_params(params=params,
+ num_hidden_layers=self.model_config.num_hidden_layers,
+ layer_name='layers')
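+ # normalize_pp_vpp_params is expected to flatten the per-pp/per-vpp-chunk dicts into a
+ # single name -> tensor mapping, re-indexing chunk-local layer names to global layer
+ # indices (see verl.utils.model for the exact behavior)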
+ self.origin_params = self._post_process_params(self.params)
+ self.inference_engine.sync_model_weights(self.params, load_format='megatron')
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ # offload parameters that don't belong to this pp rank
+ self.module.offload_params_to_cpu()
+
+ # FIXME(sgm): the best practice is to delete the cuda tensors and
+ # rebind the model weights; any cpu tensor will do
+ if get_micro_data_parallel_world_size() > 1:
+ for name in self.params.keys():
+ self.params[name] = self.origin_params[name]
+
+ # self.inference_engine.sync_model_weights(params)
+ self.inference_engine.offload_model_weights()
+
+ self.module.train()
+
+ # empty the cuda cache after each compute
+ torch.cuda.empty_cache()
+
+ def preprocess_data(self, data: DataProto) -> DataProto:
+ # prompts are identical within each training tp group; each micro-dp rank selects its own slice for inference
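+ # e.g. with micro_dp_size=2, a batch of 8 identical prompts is chunked in two:
+ # micro_dp rank 0 keeps items [0:4] and rank 1 keeps items [4:8]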
+ micro_dp_size = get_micro_data_parallel_world_size()
+ micro_dp_rank = get_micro_data_parallel_rank()
+
+ # broadcast from tp=0 to other tp ranks
+ broadcast_dict_tensor(data.batch,
+ src=mpu.get_tensor_model_parallel_src_rank(),
+ group=mpu.get_tensor_model_parallel_group())
+
+ if micro_dp_size > 1:
+ local_prompts = data.chunk(chunks=micro_dp_size)
+ data = local_prompts[micro_dp_rank]
+
+ return data
+
+ def postprocess_data(self, data: DataProto) -> DataProto:
+ meta_info = data.meta_info
+ # all gather batch among micro-dp groups
+ micro_dp_size = get_micro_data_parallel_world_size()
+ if micro_dp_size > 1:
+ data.batch = allgather_dict_tensors(data.batch.contiguous(),
+ size=get_micro_data_parallel_world_size(),
+ group=get_micro_data_parallel_group(),
+ dim=0)
+
+ # all gather batch among pp group
+ if meta_info.get('allgather_pp_output', True):
+ data.batch = allgather_dict_tensors(data.batch.contiguous(),
+ size=mpu.get_pipeline_model_parallel_world_size(),
+ group=mpu.get_pipeline_model_parallel_group(),
+ dim=0)
+ return data
+
+
+"""
+Micro Data parallel group
+"""
+
+
+def get_micro_data_parallel_group():
+ assert _MICRO_DATA_PARALLEL_GROUP is not None
+ return _MICRO_DATA_PARALLEL_GROUP
+
+
+def get_micro_data_parallel_world_size():
+ return torch.distributed.get_world_size(group=get_micro_data_parallel_group())
+
+
+def get_micro_data_parallel_rank():
+ return torch.distributed.get_rank(group=get_micro_data_parallel_group())
diff --git a/code/RL_model/verl/verl_train/.git-blame-ignore-revs b/code/RL_model/verl/verl_train/.git-blame-ignore-revs
new file mode 100644
index 0000000000000000000000000000000000000000..649ba3ca862e8e47a92b932b337fe189fbd14e7c
--- /dev/null
+++ b/code/RL_model/verl/verl_train/.git-blame-ignore-revs
@@ -0,0 +1,13 @@
+# Local usage: git config blame.ignoreRevsFile .git-blame-ignore-revs
+
+# [dev] feat: immigrate from yapf & pylint to ruff based on pre-commit
+# Changed 268 files, +10k/-9k lines. This is the biggest formatter change.
+b00f77d8559b48d57a33c0132a5ba1c81891a536
+
+# [ci] refactor: reduce ruff line-length from 300 to 120
+# Changed 238 files, +6k/-1k lines. Global formatting change.
+00a10a8ef389556f957a2f36132b2358fd6a109f
+
+# [Lint] fix: linting errors in all files
+# Changed 179 files, +1k/-3k lines. Global lint fix.
+8e5ad4688a13de81727c014a3c2e2fb26324bc20
diff --git a/code/RL_model/verl/verl_train/.gitignore b/code/RL_model/verl/verl_train/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..62d4dcfc815ec735f6acd244457bd0708ff62a2e
--- /dev/null
+++ b/code/RL_model/verl/verl_train/.gitignore
@@ -0,0 +1,130 @@
+**/*.pt
+**/checkpoints
+**/wget-log
+**/_build/
+**/*.ckpt
+**/outputs
+**/*.tar.gz
+**/playground
+**/wandb
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+dataset/*
+tensorflow/my_graph/*
+.idea/
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+# env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+tmp/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+pytest.ini
+output.txt
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+venv/
+.venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
+
+# vscode
+.vscode
+
+# Mac
+.DS_Store
+
+# vim
+*.swp
+
+# emacs
+*~
+
+# ckpt
+*.lock
+
+# data
+*.parquet
+
+
+# local logs
+logs
+log
+outputs
+.history
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/.gitmodules b/code/RL_model/verl/verl_train/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..d5dd7a6aa577ccb64650ca389b699e04fd7af259
--- /dev/null
+++ b/code/RL_model/verl/verl_train/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "recipe"]
+ path = recipe
+ url = https://github.com/verl-project/verl-recipe.git
diff --git a/code/RL_model/verl/verl_train/.pre-commit-config.yaml b/code/RL_model/verl/verl_train/.pre-commit-config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9ef606f8dc4e141430fa46a938ae11831960e8b7
--- /dev/null
+++ b/code/RL_model/verl/verl_train/.pre-commit-config.yaml
@@ -0,0 +1,45 @@
+repos:
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: "v0.12.2"
+ hooks:
+ - id: ruff
+ args: ["--fix", "--show-fixes", "--output-format=full"]
+ exclude: ^.*\.(ipynb)$
+ - id: ruff-format
+
+ - repo: https://github.com/pre-commit/mirrors-mypy
+ rev: "v1.17.0"
+ hooks:
+ - id: mypy
+
+ - repo: local
+ hooks:
+ - id: autogen-trainer-cfg
+ name: Generate and verify verl/trainer/config/_generated_*.yaml
+ entry: scripts/generate_trainer_config.sh
+ language: script
+ pass_filenames: false
+
+ - repo: local
+ hooks:
+ - id: check-docstrings
+ name: Check doc string coverage
+ entry: python3 tests/special_sanity/check_docstrings.py
+ language: python
+ pass_filenames: false
+
+ - repo: local
+ hooks:
+ - id: check-license
+ name: Check license
+ entry: python3 tests/special_sanity/check_license.py --directories examples scripts tests verl setup.py
+ language: python
+ pass_filenames: false
+
+ - repo: local
+ hooks:
+ - id: compileall
+ name: Compile all python files
+ entry: sh -c 'PYTHONWARNINGS=error python3 -m compileall -q .'
+ language: python
+ pass_filenames: false
diff --git a/code/RL_model/verl/verl_train/.readthedocs.yaml b/code/RL_model/verl/verl_train/.readthedocs.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0016868541a2a0667ef40ae6a9d861bcd26b9316
--- /dev/null
+++ b/code/RL_model/verl/verl_train/.readthedocs.yaml
@@ -0,0 +1,19 @@
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+version: 2
+
+build:
+ os: ubuntu-22.04
+ tools:
+ python: "3.11"
+ rust: "1.70"
+
+sphinx:
+ configuration: docs/conf.py
+
+python:
+ install:
+ - requirements: docs/requirements-docs.txt
+ - method: pip
+ path: .
diff --git a/code/RL_model/verl/verl_train/CONTRIBUTING.md b/code/RL_model/verl/verl_train/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..6fd3023a0859f533951476fac6e8e06fe1e8aa3f
--- /dev/null
+++ b/code/RL_model/verl/verl_train/CONTRIBUTING.md
@@ -0,0 +1,90 @@
+# Contributing to verl
+
+Thank you for considering a contribution to verl! We welcome contributions of any kind - bug fixes, enhancements, documentation improvements, or even just feedback. Whether you're an experienced developer or this is your first open-source project, your help is invaluable.
+
+Your support can take many forms:
+- Report issues or unexpected behaviors.
+- Suggest or implement new features.
+- Improve or expand documentation.
+- Review pull requests and assist other contributors.
+- Spread the word: share verl in blog posts, social media, or give the repo a ⭐.
+
+## Finding Issues to Contribute
+
+Looking for ways to dive in? Check out these issues:
+- [Good first issues](https://github.com/volcengine/verl/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22)
+- [Call for contribution](https://github.com/volcengine/verl/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22call%20for%20contribution%22)
+
+Furthermore, you can learn about the development plan and roadmap via [RFC](https://github.com/volcengine/verl/issues?q=is%3Aissue%20state%3Aopen%20label%3ARFC) and [Roadmap](https://github.com/volcengine/verl/issues?q=state%3Aopen%20label%3A%22roadmap%22) issues.
+
+
+## Developing
+
+- **Python-only**: install verl via `pip install -e .[test,vllm]` or `pip install -e .[test,sglang]` and iterate quickly. For full dependency setup, check out the verl [installation doc](https://verl.readthedocs.io/en/latest/start/install.html).
+
+## Code Linting and Formatting
+
+We rely on pre-commit to keep our code consistent. To set it up:
+
+```bash
+pip install pre-commit
+pre-commit install
+# for staged changes
+pre-commit run
+# for all files in the repo
+pre-commit run --all-files
+# run a specific hook with pre-commit
+# pre-commit run --all-files --show-diff-on-failure --color=always
+pre-commit run --all-files --show-diff-on-failure --color=always ruff
+pre-commit run --all-files --show-diff-on-failure --color=always autogen-trainer-cfg
+```
+
+## Testing
+
+Our test suites run on GitHub Actions. Check these workflows for details:
+- [GPU unit tests](https://github.com/volcengine/verl/blob/main/.github/workflows/gpu_unit_tests.yml)
+- [CPU unit tests](https://github.com/volcengine/verl/blob/main/.github/workflows/cpu_unit_tests.yml)
+- [vLLM tests](https://github.com/volcengine/verl/blob/main/.github/workflows/vllm.yml)
+- [SGLang tests](https://github.com/volcengine/verl/blob/main/.github/workflows/sgl.yml)
+
+### Adding CI tests
+
+If possible, please add CI test(s) for your new feature:
+
+1. Find the most relevant workflow yml file, which usually corresponds to a `hydra` default config (e.g. `ppo_trainer`, `ppo_megatron_trainer`, `sft_trainer`, etc).
+2. Add related path patterns to the `paths` section if not already included.
+3. Minimize the workload of the test script(s) (see existing scripts for examples).
+
+## Building the Docs
+```
+# Ensure verl is on your PYTHONPATH, e.g.:
+pip install -e .[test]
+
+# Install documentation dependencies
+cd docs
+pip install -r requirements-docs.txt
+
+# Generate HTML docs
+make clean
+make html
+
+# Preview locally
+python -m http.server -d _build/html/
+```
+Open your browser at http://localhost:8000 to explore the docs.
+
+## Pull Requests & Code Reviews
+
+Thanks for submitting a PR! To streamline reviews:
+- Follow our Pull Request Template for title format and checklist.
+- Adhere to our pre-commit lint rules and ensure all checks pass.
+- Update docs for any user-facing changes.
+- Add or update tests in the CI workflows, or explain why tests aren't applicable.
+
+## License
+
+See the [LICENSE](https://github.com/volcengine/verl/blob/main/LICENSE) file for full details.
+
+## Thank You
+
+We appreciate your contributions to verl. Your efforts help make the project stronger and more user-friendly. Happy coding!
+
diff --git a/code/RL_model/verl/verl_train/LICENSE b/code/RL_model/verl/verl_train/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..d645695673349e3947e8e5ae42332d0ac3164cd7
--- /dev/null
+++ b/code/RL_model/verl/verl_train/LICENSE
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/code/RL_model/verl/verl_train/Notice.txt b/code/RL_model/verl/verl_train/Notice.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ade439da525ac3f82936e131a1ae386f43207fd8
--- /dev/null
+++ b/code/RL_model/verl/verl_train/Notice.txt
@@ -0,0 +1 @@
+Copyright 2023-2024 Bytedance Ltd. and/or its affiliates
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/README.md b/code/RL_model/verl/verl_train/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3cb450bc6efb0007bdf0e1e4aa6dca8c39e9751e
--- /dev/null
+++ b/code/RL_model/verl/verl_train/README.md
@@ -0,0 +1,306 @@
+
+ 👋 Hi, everyone!
+ verl is an RL training library initiated by the ByteDance Seed team and maintained by the verl community.
+
+[GitHub Repo stars](https://github.com/volcengine/verl/stargazers)
+[Twitter](https://twitter.com/verl_project)
+[Documentation](https://verl.readthedocs.io/en/latest/)
+
+# verl: Volcano Engine Reinforcement Learning for LLMs
+
+verl is a flexible, efficient and production-ready RL training library for large language models (LLMs).
+
+verl is the open-source implementation of the **[HybridFlow: A Flexible and Efficient RLHF Framework](https://arxiv.org/abs/2409.19256v2)** paper.
+
+verl is flexible and easy to use with:
+
+- **Easy extension of diverse RL algorithms**: The hybrid-controller programming model enables flexible representation and efficient execution of complex post-training dataflows. Build RL dataflows such as GRPO, PPO in a few lines of code.
+
+- **Seamless integration of existing LLM infra with modular APIs**: Decouples computation and data dependencies, enabling seamless integration with existing LLM frameworks, such as FSDP, Megatron-LM, vLLM, SGLang, etc.
+
+- **Flexible device mapping**: Supports various placement of models onto different sets of GPUs for efficient resource utilization and scalability across different cluster sizes.
+
+- Ready integration with popular HuggingFace models
+
+verl is fast with:
+
+- **State-of-the-art throughput**: SOTA LLM training and inference engine integrations and SOTA RL throughput.
+
+- **Efficient actor model resharding with 3D-HybridEngine**: Eliminates memory redundancy and significantly reduces communication overhead during transitions between training and generation phases.
+
+
+
+
+
+
+## News
+
+- [2026/01] verl has been migrated to the [verl-project](https://github.com/verl-project)
+- [2026/01] The first verl meetup was successfully held in Shanghai on 01/10, hosted by Volcengine and NVIDIA; the slides have been uploaded to [verl-data](https://github.com/verl-project/verl-data).
+- [2026/01] The `recipe` directory has been migrated to a dedicated repository, [verl-recipe](https://github.com/verl-project/verl-recipe), and added as a submodule. See https://github.com/volcengine/verl/pull/4795. It works as before once you run `git submodule update --init --recursive recipe`. Note that [`transfer_queue`](verl/experimental/transfer_queue), [`fully_async_policy`](verl/experimental/fully_async_policy), [`one_step_off_policy`](verl/experimental/one_step_off_policy) and [`vla`](verl/experimental/vla) are kept under [`verl/experimental`](verl/experimental) since they are planned to be merged into the main library. Use them through `verl.experimental.{module}`.
+- [2025/12] [Mind Lab](https://macaron.im/mindlab) successfully used [verl](https://github.com/volcengine/verl) and [Megatron-bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge) to train GRPO Lora for Trillion-parameter model on 64 H800 - See their [techblog](https://macaron.im/mindlab/research/building-trillion-parameter-reasoning-rl-with-10-gpus).
+- [2025/10] verl is presented in the [PyTorch Conference 2025](https://pytorch.org/event/pytorch-conference-2025/).
+- [2025/08] verl is presented in the [PyTorch Expert Exchange Webinar](https://www.youtube.com/watch?v=Vd79NmmqY3Q&t=2s). [Slides](https://github.com/eric-haibin-lin/verl-community/blob/main/slides/verl_talk_pytorch_2025_08.pdf) available.
+- [2025/07] The [ReTool](https://arxiv.org/pdf/2504.11536) recipe is fully open sourced. [Blog](https://www.notion.so/verl-reTool-recipe-Using-multi-round-conversations-and-code-sandboxing-to-improve-the-math-of-large-23a8b5b7feba80b386b2e5b5e3c1cde0)
+- [2025/07] The first verl meetup will be held at ICML Vancouver on July 16th! Please [join us](https://lu.ma/0ek2nyao) if you are at ICML! (onsite only)
+- [2025/06] verl with Megatron backend enables large MoE models such as [DeepSeek-671B and Qwen3-235B](https://verl.readthedocs.io/en/latest/perf/dpsk.html).
+- [2025/03] [DAPO](https://dapo-sia.github.io/) is the open-sourced SOTA RL algorithm that achieves 50 points on AIME 2024 based on the Qwen2.5-32B pre-trained model, surpassing the previous SOTA achieved by DeepSeek's GRPO (DeepSeek-R1-Zero-Qwen-32B). DAPO's training is fully powered by verl and the reproduction code is available in `recipe/dapo` now.
+ more...
+
+ - [2025/04] [Seed-Thinking-v1.5](https://github.com/ByteDance-Seed/Seed-Thinking-v1.5/blob/main/seed-thinking-v1.5.pdf) tech report is released! Trained with verl, Seed-Thinking-v1.5 achieves 86.7 on AIME 2024, 55.0 on Codeforces and 77.3 on GPQA, demonstrating excellent reasoning abilities in STEM and coding. Beyond reasoning tasks, the method demonstrates notable generalization across diverse domains.
+ - [2025/07] verl keynote at [AWS AI Hours Singapore](https://pages.awscloud.com/aws-ai-hours-sg.html#agenda) on 7/8, verl & verl-agent project updates at [Agent for SWE meetup](https://lu.ma/e498qhsi) by LF AI & Data Singapore on 7/11.
+ - [2025/06] verl team will provide latest project updates at [PyTorch Day China](https://www.lfasiallc.com/pytorch-day-china/) on June 7th. Meet our dev team in Beijing!
+ - [2025/04] [VAPO](https://arxiv.org/pdf/2504.05118) (value-based augmented PPO) paper covers our latest RL method for reasoning models. Trained from Qwen-32B-base model, VAPO achieves 60.4 on AIME 2024, outperforming DAPO-32B.
+ - [2025/05] [PF-PPO](https://arxiv.org/abs/2409.06957), accepted to ICML 2025, is now supported in verl! PF-PPO enhances policy learning efficiency and robustness by filtering potentially noisy reward signals and reusing high-quality experiences via a replay buffer.
+ - [2025/04] We will give a tutorial about latest post-training techniques and programming guide for verl at [ICLR 2025 Expo](https://iclr.cc/virtual/2025/calendar?filter_events=Expo+Talk+Panel&filter_rooms=), [SCI-FM workshop](https://open-foundation-model.github.io/) and [LMSys afterparty](https://lu.ma/d23nyynm). Talk materials available [here](https://github.com/eric-haibin-lin/verl-community/tree/main/iclr25).
+ - [2025/03] verl v0.3.0.post1 is released! See [release note](https://github.com/volcengine/verl/releases/) for details. It achieves [~1.4x speedup](https://tongyx361.github.io/blogs/posts/verl-intro/#/verl-flexible-and-efficient-rl-for-llms) compared to prev versions.
+ - [2025/05] verl will be presented at [A2M Shanghai](https://a2m.msup.com.cn/home/?aid=4488&city=shanghai) on 5/16 - 5/17.
+ - [2025/05] verl will be presented at [GOSIM x PyTorch Day 2025](https://paris2025.gosim.org/). See you in Paris!
+ - [2025/03] We introduced the programming model of verl at the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg) and [verl intro and updates](https://github.com/eric-haibin-lin/verl-community/blob/main/slides/verl-lmsys-meetup.pdf) at the [SGLang-LMSYS Org Meetup](https://lu.ma/ntjrr7ig) in Sunnyvale mid-March.
+ - [2025/03] We will present verl(HybridFlow) at EuroSys 2025. See you in Rotterdam!
+ - [2025/02] verl v0.2.0.post2 is released!
+ - [2025/02] We presented verl in the Bytedance/NVIDIA/Anyscale Ray Meetup. See you in San Jose!
+ - [2025/01] [Doubao-1.5-pro](https://team.doubao.com/zh/special/doubao_1_5_pro) is released with SOTA-level performance on LLM & VLM. The RL scaling preview model is trained using verl, reaching OpenAI O1-level performance on math benchmarks (70.0 pass@1 on AIME).
+ - [2024/12] verl is presented at Ray Forward 2024. Slides available here
+ - [2024/12] The team presented Post-training LLMs: From Algorithms to Infrastructure at NeurIPS 2024. Slides and video available.
+ - [2024/10] verl is presented at Ray Summit. Youtube video available.
+ - [2024/08] HybridFlow (verl) is accepted to EuroSys 2025.
+
+
+
+## Key Features
+
+- **FSDP**, **FSDP2** and **Megatron-LM** for training.
+- **vLLM**, **SGLang** and **HF Transformers** for rollout generation.
+- Compatible with Hugging Face Transformers and Modelscope Hub: [Qwen-3](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen3-8b.sh), Qwen-2.5, Llama3.1, Gemma2, DeepSeek-LLM, etc.
+- Supervised fine-tuning.
+- Reinforcement learning with [PPO](examples/ppo_trainer/), [GRPO](examples/grpo_trainer/), [GSPO](https://github.com/verl-project/verl-recipe/tree/main/gspo/), [ReMax](examples/remax_trainer/), [REINFORCE++](https://verl.readthedocs.io/en/latest/examples/config.html#algorithm), [RLOO](examples/rloo_trainer/), [PRIME](https://github.com/verl-project/verl-recipe/tree/main/prime/), [DAPO](https://github.com/verl-project/verl-recipe/tree/main/dapo/), [DrGRPO](https://github.com/verl-project/verl-recipe/tree/main/drgrpo), [KL_Cov & Clip_Cov](https://github.com/verl-project/verl-recipe/tree/main/entropy) etc.
+ - Supports model-based rewards and function-based (verifiable) rewards for math, [coding](https://github.com/volcengine/verl-recipe/tree/main/dapo), etc.
+ - Support vision-language models (VLMs) and [multi-modal RL](examples/grpo_trainer/run_qwen2_5_vl-7b.sh) with Qwen2.5-vl, Kimi-VL
+ - [Multi-turn with tool calling](https://github.com/volcengine/verl/tree/main/examples/sglang_multiturn)
+- LLM alignment recipes such as [Self-play preference optimization (SPPO)](https://github.com/verl-project/verl-recipe/tree/main/sppo)
+- Flash attention 2, [sequence packing](examples/ppo_trainer/run_qwen2-7b_seq_balance.sh), [sequence parallelism](examples/ppo_trainer/run_deepseek7b_llm_sp2.sh) support via DeepSpeed Ulysses, [LoRA](examples/sft/gsm8k/run_qwen_05_peft.sh), [Liger-kernel](examples/sft/gsm8k/run_qwen_05_sp2_liger.sh).
+- Scales up to 671B models and hundreds of GPUs with [expert parallelism](https://github.com/volcengine/verl/pull/1467)
+- Multi-gpu [LoRA RL](https://verl.readthedocs.io/en/latest/advance/ppo_lora.html) support to save memory.
+- Experiment tracking with wandb, swanlab, mlflow and tensorboard.
+- Hardware Support: Supports NVIDIA, AMD, [Ascend](https://github.com/volcengine/verl/blob/main/docs/ascend_tutorial/ascend_quick_start.rst)
+
+## Upcoming Features and Changes
+
+- Q3 Roadmap https://github.com/volcengine/verl/issues/2388
+- DeepSeek 671b optimizations with Megatron https://github.com/volcengine/verl/issues/1033
+- Multi-turn rollout and tools using optimizations https://github.com/volcengine/verl/issues/1882
+- [Agent integration](https://github.com/volcengine/verl/tree/main/verl/experimental/agent_loop)
+- Async and off-policy architecture https://github.com/volcengine/verl/pull/2231
+- List of breaking changes since v0.4 https://github.com/volcengine/verl/discussions/2270
+
+## Getting Started
+
+Documentation
+
+**Quickstart:**
+
+- [Installation](https://verl.readthedocs.io/en/latest/start/install.html)
+- [Quickstart](https://verl.readthedocs.io/en/latest/start/quickstart.html)
+- [Programming Guide](https://verl.readthedocs.io/en/latest/hybrid_flow.html) & [Tech Talk](https://hcqnc.xetlk.com/sl/3vACOK) (in Chinese)
+- [PPO in verl](https://verl.readthedocs.io/en/latest/algo/ppo.html)
+- [GRPO in verl](https://verl.readthedocs.io/en/latest/algo/grpo.html)
+
+**Running a PPO example step-by-step:**
+
+- [Prepare Data for Post-Training](https://verl.readthedocs.io/en/latest/preparation/prepare_data.html)
+- [Implement Reward Function for Dataset](https://verl.readthedocs.io/en/latest/preparation/reward_function.html)
+- [PPO Example Architecture](https://verl.readthedocs.io/en/latest/examples/ppo_code_architecture.html)
+- [Config Explanation](https://verl.readthedocs.io/en/latest/examples/config.html)
+
+**Reproducible algorithm baselines:**
+
+- [RL performance on coding, math](https://verl.readthedocs.io/en/latest/algo/baseline.html)
+
+**For code explanation and advance usage (extension):**
+
+- PPO Trainer and Workers
+
+ - [PPO Ray Trainer](https://verl.readthedocs.io/en/latest/workers/ray_trainer.html)
+ - [PyTorch FSDP Backend](https://verl.readthedocs.io/en/latest/workers/fsdp_workers.html)
+ - [Megatron-LM Backend](https://verl.readthedocs.io/en/latest/index.html)
+
+- Advanced Usage and Extension
+ - [Add Models with the FSDP Backend](https://verl.readthedocs.io/en/latest/advance/fsdp_extension.html)
+ - [Add Models with the Megatron-LM Backend](https://verl.readthedocs.io/en/latest/advance/megatron_extension.html)
+ - [Multi-turn Rollout Support](https://verl.readthedocs.io/en/latest/sglang_multiturn/multiturn.html)
+ - [Search Tool Integration](https://verl.readthedocs.io/en/latest/sglang_multiturn/search_tool_example.html)
+ - [Sandbox Fusion Integration](https://verl.readthedocs.io/en/latest/examples/sandbox_fusion_example.html)
+ - [Deployment using Separate GPU Resources](https://github.com/volcengine/verl/tree/main/examples/split_placement)
+ - [Extend to Other RL(HF) algorithms](https://verl.readthedocs.io/en/latest/advance/dpo_extension.html)
+ - [Ray API design tutorial](https://verl.readthedocs.io/en/latest/advance/placement.html)
+
+**Blogs from the community**
+
+- [When Reasoning Models Break Tokenization: The Hidden Complexity of Multiturn Training](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/blob/main/rlhf/verl/multi-turn/fast_tokenization/multiturn_tokenization_and_masking.md)
+- [verl deployment on AWS SageMaker](https://medium.com/@kaige.yang0110/run-verl-on-sagemaker-using-4x8-l40s-gpus-8e6d5c3c61d3)
+- [verl x SGLang Multi-turn Code Walkthrough](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/blob/main/rlhf/verl/multi-turn/code-walk-through/readme_EN.md)
+- [Optimizing SGLang Memory Usage in verl](https://hebiao064.github.io/rl-memory-management)
+- [SGLang, verl, OpenBMB and Tsinghua University: Pioneering End-to-End Multi-Turn RLHF](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/blob/main/rlhf/verl/multi-turn/verl-multiturn-rollout-Release.md)
+- [Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm Integration](https://rocm.blogs.amd.com/artificial-intelligence/verl-large-scale/README.html)
+- [veMLP x verl :玩转强化学习训练](https://mp.weixin.qq.com/s/7nbqxk4knMGd-hQE9ls2tA)
+- [使用 verl 进行 GRPO 分布式强化学习训练最佳实践](https://www.volcengine.com/docs/6459/1463942)
+- [HybridFlow verl 原文浅析](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/blob/main/rlhf/verl/readme.md)
+- [最高提升 20 倍吞吐量!豆包大模型团队发布全新 RLHF 框架,现已开源!](https://team.doubao.com/en/blog/%E6%9C%80%E9%AB%98%E6%8F%90%E5%8D%8720%E5%80%8D%E5%90%9E%E5%90%90%E9%87%8F-%E8%B1%86%E5%8C%85%E5%A4%A7%E6%A8%A1%E5%9E%8B%E5%9B%A2%E9%98%9F%E5%8F%91%E5%B8%83%E5%85%A8%E6%96%B0-rlhf-%E6%A1%86%E6%9E%B6-%E7%8E%B0%E5%B7%B2%E5%BC%80%E6%BA%90)
+
+## Performance Tuning Guide
+
+Performance is essential for on-policy RL algorithms. We have written a detailed [performance tuning guide](https://verl.readthedocs.io/en/latest/perf/perf_tuning.html) to help you optimize performance.
+
+## Upgrade to vLLM >= v0.8.2
+
+verl now supports vLLM>=0.8.2 when using FSDP as the training backend. Please refer to [this document](https://github.com/volcengine/verl/blob/main/docs/README_vllm0.8.md) for the installation guide and more information. Please avoid vllm 0.7.x, which contains bugs that may lead to OOMs and unexpected errors.
+
+## Use Latest SGLang
+
+SGLang is fully supported in verl, and the SGLang RL Group is working extensively on unique features, including multi-turn agentic RL, VLM RLHF, server-based RL, and partial rollout. Please refer to [this document](https://verl.readthedocs.io/en/latest/workers/sglang_worker.html) for the installation guide and more information.
+
+## Upgrade to FSDP2
+
+verl is fully embracing FSDP2! FSDP2 is recommended by the torch distributed team, providing better throughput and memory usage, and is composable with other features (e.g. torch.compile). To enable FSDP2, simply use verl main and set the following options:
+
+```
+actor_rollout_ref.ref.strategy=fsdp2
+actor_rollout_ref.actor.strategy=fsdp2
+critic.strategy=fsdp2
+reward_model.strategy=fsdp2
+```
+
+Furthermore, FSDP2 cpu offloading is compatible with gradient accumulation. You can turn it on to save memory with `actor_rollout_ref.actor.fsdp_config.offload_policy=True`. For more details, see https://github.com/volcengine/verl/pull/1026
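+
+For example, to run the actor with FSDP2 and cpu offloading enabled, the two options combine as follows:
+
+```
+actor_rollout_ref.actor.strategy=fsdp2
+actor_rollout_ref.actor.fsdp_config.offload_policy=True
+```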
+
+## AMD Support (ROCm Kernel)
+
+verl now supports FSDP as the training engine (Megatron support coming soon) and integrates with both vLLM and SGLang as inference engines. Please refer to [this document](https://github.com/volcengine/verl/blob/main/docs/amd_tutorial/amd_build_dockerfile_page.rst) for the installation guide and more information, and [this document](https://github.com/volcengine/verl/blob/main/docs/amd_tutorial/amd_vllm_page.rst) for vLLM performance tuning on ROCm.
+
+## Citation and acknowledgement
+
+If you find the project helpful, please cite:
+
+- [HybridFlow: A Flexible and Efficient RLHF Framework](https://arxiv.org/abs/2409.19256v2)
+- [A Framework for Training Large Language Models for Code Generation via Proximal Policy Optimization](https://i.cs.hku.hk/~cwu/papers/gmsheng-NL2Code24.pdf)
+
+```bibtex
+@article{sheng2024hybridflow,
+ title = {HybridFlow: A Flexible and Efficient RLHF Framework},
+ author = {Guangming Sheng and Chi Zhang and Zilingfeng Ye and Xibin Wu and Wang Zhang and Ru Zhang and Yanghua Peng and Haibin Lin and Chuan Wu},
+ year = {2024},
+ journal = {arXiv preprint arXiv: 2409.19256}
+}
+```
+
+verl is inspired by the design of Nemo-Aligner, Deepspeed-chat and OpenRLHF. The project is adopted and contributed to by Bytedance, Anyscale, LMSys.org, [Alibaba Qwen team](https://github.com/QwenLM/), Shanghai AI Lab, Tsinghua University, UC Berkeley, UCLA, UIUC, University of Hong Kong, ke.com, [All Hands AI](https://www.all-hands.dev/), [ModelBest](http://modelbest.cn/), JD AI Lab, Microsoft Research, [StepFun](https://www.stepfun.com/), Amazon, LinkedIn, Meituan, [Camel-AI](https://www.camel-ai.org/), [OpenManus](https://github.com/OpenManus), Xiaomi, NVIDIA research, [Baichuan](https://www.baichuan-ai.com/home), [RedNote](https://www.xiaohongshu.com/), [SwissAI](https://www.swiss-ai.org/), [Moonshot AI (Kimi)](https://www.moonshot-ai.com/), Baidu, Snowflake, Skywork.ai, JetBrains, [IceSword Lab](https://www.iceswordlab.com), and many more.
+
+## Awesome Projects Built with `verl`
+
+You are welcome to register your awesome project built with `verl` for other developers' reference!
+
+- [TinyZero](https://github.com/Jiayi-Pan/TinyZero): a reproduction of **DeepSeek R1 Zero** recipe for reasoning tasks 
+- [SkyThought](https://github.com/NovaSky-AI/SkyThought): RL training for Sky-T1-7B by NovaSky AI team. 
+- [simpleRL-reason](https://github.com/hkust-nlp/simpleRL-reason): SimpleRL-Zoo: Investigating and Taming Zero Reinforcement Learning for Open Base Models in the Wild 
+- [Easy-R1](https://github.com/hiyouga/EasyR1): **Multi-modal** RL training framework 
+- [OpenManus-RL](https://github.com/OpenManus/OpenManus-RL): LLM Agents RL tuning framework for multiple agent environments. 
+- [rllm](https://github.com/agentica-project/rllm): async RL training with [verl-pipeline](https://github.com/agentica-project/verl-pipeline) 
+- [RAGEN](https://github.com/ZihanWang314/ragen): a general-purpose reasoning **agent** training framework 
+- [Search-R1](https://github.com/PeterGriffinJin/Search-R1): RL with reasoning and **searching (tool-call)** interleaved LLMs 
+- [ReSearch](https://github.com/Agent-RL/ReSearch): Learning to **Re**ason with **Search** for LLMs via Reinforcement Learning 
+- [Skywork-OR1](https://github.com/SkyworkAI/Skywork-OR1): Skywork open reasoner series 
+- [ToRL](https://github.com/GAIR-NLP/ToRL): Scaling tool-integrated RL 
+- [Absolute Zero Reasoner](https://github.com/LeapLabTHU/Absolute-Zero-Reasoner): [A no human curated data self-play framework for reasoning](https://arxiv.org/abs/2505.03335) 
+- [verl-agent](https://github.com/langfengQ/verl-agent): A scalable training framework for **long-horizon LLM/VLM agents**, along with a new algorithm **GiGPO** 
+- [RL-Factory](https://github.com/Simple-Efficient/RL-Factory): An easy and efficient RL post-training framework for Agentic Learning 
+- [ReTool](https://retool-rl.github.io/): ReTool: reinforcement learning for strategic tool use in LLMs. Code release is in progress...
+- [verl-tool](https://github.com/TIGER-AI-Lab/verl-tool): A unified and easy-to-extend tool-agent training framework based on verl
+- [PRIME](https://github.com/PRIME-RL/PRIME): Process reinforcement through implicit rewards 
+- [MemAgent](https://github.com/BytedTsinghua-SIA/MemAgent): MemAgent: Reshaping Long-Context LLM with Multi-Conv RL based Memory Agent 
+- [POLARIS](https://github.com/ChenxinAn-fdu/POLARIS): A Post-training recipe for scaling RL on Advanced Reasoning models 
+- [GUI-R1](https://github.com/ritzz-ai/GUI-R1): **GUI-R1**: A Generalist R1-style Vision-Language Action Model For **GUI Agents** 
+- [DeepRetrieval](https://github.com/pat-jj/DeepRetrieval): RL Training of **Search Agent** with **Search/Retrieval Outcome** 
+- [Code-R1](https://github.com/ganler/code-r1): Reproducing R1 for **Code** with Reliable Rewards 
+- [DeepResearcher](https://github.com/GAIR-NLP/DeepResearcher): Scaling deep research via reinforcement learning in real-world environments 
+- [VAGEN](https://github.com/RAGEN-AI/VAGEN): Training VLM agents with multi-turn reinforcement learning 
+- [RM-R1](https://arxiv.org/abs/2505.02387): RL training of reasoning reward models 
+- [LUFFY](https://arxiv.org/pdf/2504.14945): Learning to Reason under Off-Policy Guidance
+- [DeepMath](https://github.com/zwhe99/DeepMath): DeepMath-103K data and series models for math reasoning
+- [PACS](https://github.com/ritzz-ai/PACS): Implicit Actor Critic Coupling via a Supervised Learning Framework for RLVR 
+- [Entropy Mechanism of RL](https://github.com/PRIME-RL/Entropy-Mechanism-of-RL): The Entropy Mechanism of Reinforcement Learning for Large Language Model Reasoning
+- [LLaSA-TTS-GRPO](https://github.com/channel-io/ch-tts-llasa-rl-grpo): TTS fine-tuning with GRPO optimization based on LLASA models 
+- [PF-PPO](https://arxiv.org/abs/2409.06957): Policy Filtration for PPO based on the reliability of reward signals for more efficient and robust RLHF.
+- [RACRO](https://github.com/gyhdog99/RACRO2): Build multi-modal reasoning models via decoupling it into query-conditioned captioning and text-only reasoning 
+- [Agent Lightning](https://github.com/microsoft/agent-lightning): A flexible and extensible framework that enables seamless agent optimization for any existing agent framework. 
+- [VTool-R1](https://github.com/VTOOL-R1/vtool-r1): VLMs Learn to Think with Images via Reinforcement Learning on Multimodal Tool Use. 
+- [Kimina-Prover-RL](https://github.com/project-numina/kimina-prover-rl/tree/main/recipe/kimina_prover_rl): Training pipeline for formal theorem proving, based on a paradigm inspired by DeepSeek-R1.
+- [RL-PLUS](https://github.com/YihongDong/RL-PLUS): Countering Capability Boundary Collapse of LLMs in Reinforcement Learning with Hybrid-policy Optimization.
+- [rStar2-Agent](https://github.com/microsoft/rStar): Using reinforcement learning with multi-step tool-calling for math tasks, rStar2-Agent-14B reaches frontier-level math reasoning in just 510 RL training steps 
+- [Vision-SR1](https://github.com/zli12321/Vision-SR1): Self-Rewarding Vision-Language Model via Reasoning Decomposition 
+- [SimpleVLA-RL](https://github.com/PRIME-RL/SimpleVLA-RL): SimpleVLA-RL: A Simple yet Effective Vision-Language Action Model for Reinforcement Learning 
+- [Table-R1](https://github.com/Table-R1/Table-R1): Table-R1: Inference-Time Scaling for Table Reasoning 
+- [Revisual-R1](https://github.com/CSfufu/Revisual-R1): Revisual-R1: Advancing Multimodal Reasoning From Optimized Cold Start to Staged Reinforcement Learning 
+- [ARES](https://github.com/shawn0728/ARES): ARES: Multimodal Adaptive Reasoning via Difficulty-Aware Token-Level Entropy Shaping 
+- [Meta-Bandit-LLM](https://github.com/sanxing-chen/meta-bandit-llm): Meta-Bandit-LLM: Long-horizon multiturn interactive training for meta-bandit agents 
+- [PokeeResearch](https://github.com/Pokee-AI/PokeeResearchOSS): PokeeResearch: State-of-the-art 7B DeepResearch Agent that leverages web search and content reading capabilities to answer complex questions using the most up-to-date information available online. 
+- [Search Self-play](https://github.com/Alibaba-Quark/SSP): Pushing the Frontier of Agent Capability without Supervision 
+- [OneThinker](https://github.com/tulerfeng/OneThinker): All-in-one Reasoning Model for Image and Video 
+- [OpenTinker](https://github.com/open-tinker/OpenTinker): Democratizing Agentic Reinforcement Learning as a Service 
+- [FlowRL](https://github.com/Xuekai-Zhu/FlowRL): Matching reward distributions via **flow balance** for diverse exploration and generalizable reasoning 
+- [Logic-RL](https://github.com/Unakar/Logic-RL): a reproduction of DeepSeek R1 Zero on 2K Tiny Logic Puzzle Dataset. 
+- [Seed-Coder](https://github.com/ByteDance-Seed/Seed-Coder): RL training of Seed-Coder boosts performance on competitive programming 
+- [all-hands/openhands-lm-32b-v0.1](https://www.all-hands.dev/blog/introducing-openhands-lm-32b----a-strong-open-coding-agent-model): A strong, open coding agent model, trained with [multi-turn fine-tuning](https://github.com/volcengine/verl/pull/195)
+- [s3](https://github.com/pat-jj/s3): **Efficient Yet Effective** Search Agent Training via RL 
+- [Rec-R1](https://arxiv.org/pdf/2503.24289): Bridging Generative Large Language Models and Recommendation Systems via Reinforcement Learning
+- [Explore RL Data Scaling](https://arxiv.org/abs/2503.22230): Exploring Data Scaling Trends and Effects in Reinforcement Learning from Human Feedback
+- [FIRE](https://arxiv.org/abs/2410.21236): Flaming-hot initiation with regular execution sampling for large language models
+- [DQO](https://arxiv.org/abs/2410.09302): Enhancing multi-Step reasoning abilities of language models through direct Q-function optimization
+- [ProRL](https://arxiv.org/abs/2505.24864): Prolonged Reinforcement Learning Expands Reasoning Boundaries in Large Language Models
+- [cognition-engineering](https://github.com/gair-nlp/cognition-engineering): Test time scaling drives cognition engineering. 
+- [Trust Region Preference Approximation](https://github.com/XueruiSu/Trust-Region-Preference-Approximation): A simple and stable **reinforcement learning algorithm** for LLM reasoning. 
+- [AdaRFT](https://github.com/uscnlp-lime/verl): Efficient Reinforcement Finetuning via **Adaptive Curriculum Learning** 
+- [critic-rl](https://github.com/HKUNLP/critic-rl): LLM critics for code generation 
+- [self-rewarding-reasoning-LLM](https://arxiv.org/pdf/2502.19613): self-rewarding and correction with **generative reward models** 
+- [DeepEnlighten](https://github.com/DolbyUUU/DeepEnlighten): Reproduce R1 with **social reasoning** tasks and analyze key findings 
+- [MetaSpatial](https://github.com/PzySeere/MetaSpatial): Reinforcing **3D Spatial Reasoning** in **VLMs** for the **Metaverse** 
+- [PURE](https://github.com/CJReinforce/PURE): **Credit assignment** is the key to successful reinforcement fine-tuning using **process reward model** 
+- [cognitive-behaviors](https://github.com/kanishkg/cognitive-behaviors): Cognitive Behaviors that Enable Self-Improving Reasoners, or, Four Habits of Highly Effective STaRs 
+- [deepscaler](https://github.com/agentica-project/rllm/tree/deepscaler): iterative context scaling with GRPO 
+- [DAPO](https://dapo-sia.github.io/): the fully open source SOTA RL algorithm that beats DeepSeek-R1-zero-32B 
+- [NoisyRollout](https://github.com/NUS-TRAIL/NoisyRollout): Reinforcing Visual Reasoning with Data Augmentation 
+- [SPEAR](https://github.com/TencentYoutuResearch/SPEAR): **Self-imitation** with **Progressive Exploration** for Agentic Reinforcement Learning (ICLR 2026) 
+
+## Contribution Guide
+
+See the [contribution guide](CONTRIBUTING.md)
+
+## About [ByteDance Seed Team](https://team.doubao.com/)
+
+Founded in 2023, the ByteDance Seed Team is dedicated to crafting the industry's most advanced AI foundation models. The team aspires to become a world-class research team and to make significant contributions to the advancement of science and society. You can get to know ByteDance Seed better through the following channels👇
+
+We are HIRING! Send us an [email](mailto:the.verl.project@gmail.com) if you are interested in internship/FTE opportunities in RL for agents.
diff --git a/code/RL_model/verl/verl_train/debug_reward_func.jsonl b/code/RL_model/verl/verl_train/debug_reward_func.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..5632c825eeb76f767db45916020a010268993058
--- /dev/null
+++ b/code/RL_model/verl/verl_train/debug_reward_func.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:928843280b906de5de388ebfdfb87a5b54c0ec782ab8ed39e5768dc2b275b754
+size 4048915
diff --git a/code/RL_model/verl/verl_train/docs/Makefile b/code/RL_model/verl/verl_train/docs/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..8bda904a9b0b29dfcf538cb52b806dd910710a4a
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+SPHINXPROJ = verl
+SOURCEDIR = .
+BUILDDIR = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/code/RL_model/verl/verl_train/docs/README.md b/code/RL_model/verl/verl_train/docs/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8c5db04874138435ef986342a7b8be668b81d0b0
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/README.md
@@ -0,0 +1,22 @@
+# verl documentation
+
+## Build the docs
+
+```bash
+# To view the auto-generated API docstrings, make sure verl is importable from the Python path. For instance, install verl from the repo root via:
+# pip install -e "..[test]"
+
+# Install dependencies needed for building docs.
+pip install -r requirements-docs.txt
+
+# Build the docs.
+make clean
+make html
+```
+
+## Open the docs with your browser
+
+```bash
+python -m http.server -d _build/html/
+```
+Launch your browser and navigate to http://localhost:8000 to view the documentation. Alternatively, you can open the file `_build/html/index.html` directly in your browser.
diff --git a/code/RL_model/verl/verl_train/docs/README_vllm0.7.md b/code/RL_model/verl/verl_train/docs/README_vllm0.7.md
new file mode 100644
index 0000000000000000000000000000000000000000..e84feddd7537b0cadb1157993a3819bfc5e52042
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/README_vllm0.7.md
@@ -0,0 +1,73 @@
+# Upgrading to vllm >= 0.7
+
+Note: verl+vllm 0.8.3 is now stable. Please see ``docs/README_vllm0.8.md`` for the upgrade guide.
+
+## Installation
+
+Note: At time of writing, verl+vllm 0.7.x supports **FSDP** for training and **vLLM** for rollout.
+
+```bash
+# Create the conda environment
+conda create -n verl python==3.10
+conda activate verl
+
+# Install verl
+git clone https://github.com/volcengine/verl.git
+cd verl
+pip3 install -e .
+
+# Install the latest stable version of vLLM
+pip3 install vllm==0.7.3
+
+# Install flash-attn
+pip3 install flash-attn --no-build-isolation
+
+```
+
+Note that if you install a lower version of vLLM (0.7.0, 0.7.1, or 0.7.2), you need to apply some small manual patches to vLLM (under /path/to/site-packages/vllm after installation) after the above steps:
+
+- vllm/distributed/parallel_state.py: Remove the assertion below:
+
+```python
+if (world_size
+ != tensor_model_parallel_size * pipeline_model_parallel_size):
+ raise RuntimeError(
+ f"world_size ({world_size}) is not equal to "
+ f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
+ f"pipeline_model_parallel_size ({pipeline_model_parallel_size})")
+
+```
+
+- vllm/executor/uniproc_executor.py: change `local_rank = rank` to `local_rank = int(os.environ["LOCAL_RANK"])`
+- vllm/model_executor/model_loader/weight_utils.py: remove the `torch.cuda.empty_cache()` in `pt_weights_iterator`
+
+## Features
+
+### Use CUDA graphs
+
+After installation, examples using FSDP as the training backend can be run. By default, `enforce_eager` is set to True, which disables CUDA graphs. To enable CUDA graphs and the sleep mode of vLLM>=0.7, add the following lines to the bash script:
+
+```bash
+actor_rollout_ref.rollout.enforce_eager=False \
+actor_rollout_ref.rollout.free_cache_engine=True \
+
+```
+
+For a typical job like examples/ppo_trainer/run_qwen2-7b_seq_balance.sh, the rollout generation time is 85 seconds with vLLM 0.7.0. With CUDA graphs enabled, the generation duration is further reduced to 62 seconds.
+
+**Note:** Currently, if `n` is greater than 1 in `SamplingParams` with vLLM>=0.7, there is a potential stability issue in rollout generation time (some iterations see generation-time bursts) when using vLLM's V0 engine.
+
+### Use vLLM V1 Engine
+
+Using the vLLM V1 engine can avoid instability issues and achieve additional performance improvements. To use the V1 engine, you can first uninstall the previously installed vLLM and then follow the steps below to install the newer version.
+
+```bash
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+git checkout 2275784
+sed -i "903a\ data_parallel_size = world_size // pipeline_model_parallel_size // tensor_model_parallel_size" ./vllm/distributed/parallel_state.py
+VLLM_USE_PRECOMPILED=1 pip install --editable .
+```
+
+Then you can enable the V1 engine by setting `export VLLM_USE_V1=1`. In some benchmark tests, the V1 engine demonstrates a 1.5x speed improvement over the vLLM V0 engine.
+Stable support for the vLLM V1 engine is available on verl main.
diff --git a/code/RL_model/verl/verl_train/docs/README_vllm0.8.md b/code/RL_model/verl/verl_train/docs/README_vllm0.8.md
new file mode 100644
index 0000000000000000000000000000000000000000..d4f509f19f780a4e8b3edec6bb256d2aa964639a
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/README_vllm0.8.md
@@ -0,0 +1,52 @@
+# Upgrading to vLLM >= 0.8
+
+Last updated: 05/04/2025.
+
+## Installation
+
+Note: This version of verl+vLLM 0.8+ supports **FSDP** for training and **vLLM** for rollout.
+
+```bash
+# Create the conda environment
+conda create -n verl python==3.10
+conda activate verl
+
+# Install verl
+git clone https://github.com/volcengine/verl.git
+cd verl
+pip3 install -e .
+
+# Install the latest stable version of vLLM
+pip3 install vllm==0.8.3
+
+# Install flash-attn
+pip3 install flash-attn --no-build-isolation
+
+```
+
+We have a pre-built docker image for verl+vLLM 0.8.3. You can pull it directly with the following command:
+
+```bash
+docker pull hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0
+```
+
+## Features
+
+vLLM 0.8+ supports cuda graph and V1 engine by default in verl. To enable these features, remember to add the following lines to the bash script:
+
+```bash
+actor_rollout_ref.rollout.enforce_eager=False \
+actor_rollout_ref.rollout.free_cache_engine=True \
+```
+
+and also **remove** any environment variable overrides that were required only for older vLLM versions, if they exist.
+
+## Notes
+
+If you directly upgrade to vllm>=0.8, some dependency packages may change versions. If you encounter the following problem:
+
+```bash
+from torch.multiprocessing.reductions import ForkingPickler
+ImportError: cannot import name 'ForkingPickler' from 'torch.multiprocessing.reductions' (/opt/conda/lib/python3.11/site-packages/torch/multiprocessing/reductions.py)
+```
+
+You need to upgrade `tensordict` to version 0.6.2 using the command `pip install tensordict==0.6.2`.
diff --git a/code/RL_model/verl/verl_train/docs/_static/custom.css b/code/RL_model/verl/verl_train/docs/_static/custom.css
new file mode 100644
index 0000000000000000000000000000000000000000..32f08475754bc280bca407d1643ec3aa68eeacf3
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/_static/custom.css
@@ -0,0 +1,217 @@
+/* Make the documentation use full screen width */
+.wy-nav-content {
+ max-width: none !important;
+ width: 100% !important;
+ padding: 1.618em 3.236em !important;
+}
+
+/* Adjust the content wrapper - will be set by JavaScript */
+.wy-nav-content-wrap {
+ margin-left: 300px;
+ transition: margin-left 0.2s ease;
+ width: auto !important;
+ position: relative !important;
+ background: white !important;
+ min-height: 100vh !important;
+}
+
+/* Make the main content area responsive */
+.rst-content {
+ max-width: none !important;
+ width: 100% !important;
+}
+
+/* Optional: Adjust table widths to prevent overflow */
+.rst-content table.docutils {
+ width: 100% !important;
+ table-layout: auto !important;
+}
+
+/* Optional: Better code block width handling */
+.rst-content .highlight {
+ width: 100% !important;
+}
+
+/* Content area positioning already handled above */
+
+/* Optional: Improve readability with some margin on very wide screens */
+@media (min-width: 1400px) {
+ .wy-nav-content {
+ max-width: none !important;
+ margin: 0 auto !important;
+ }
+}
+
+/* Resizable sidebar styles */
+.wy-nav-side {
+ position: fixed !important;
+ top: 0 !important;
+ bottom: 0 !important;
+ left: 0 !important;
+ width: 300px;
+ min-width: 200px;
+ max-width: 600px;
+ display: flex;
+ flex-direction: column;
+ z-index: 200 !important;
+}
+
+/* Ensure sidebar header (logo, search) adapts to width */
+.wy-side-nav-search {
+ width: 100% !important;
+ box-sizing: border-box !important;
+ padding: 0.809em 0.809em !important;
+}
+
+.wy-side-nav-search input[type="text"] {
+ width: 100% !important;
+ box-sizing: border-box !important;
+}
+
+/* Make logo/title area responsive */
+.wy-side-nav-search > div.version {
+ width: 100% !important;
+}
+
+.wy-side-nav-search > a {
+ width: 100% !important;
+ display: block !important;
+ white-space: nowrap !important;
+ overflow: hidden !important;
+ text-overflow: ellipsis !important;
+}
+
+/* Responsive adjustments for narrow sidebar */
+@media (max-width: 300px) {
+ .wy-side-nav-search > a {
+ font-size: 0.9em !important;
+ }
+
+ .wy-side-nav-search input[type="text"] {
+ font-size: 0.8em !important;
+ }
+}
+
+/* Ensure search input doesn't overflow */
+.wy-side-nav-search form {
+ width: 100% !important;
+ margin: 0 !important;
+}
+
+/* Make search icon responsive */
+.wy-side-nav-search .wy-dropdown {
+ width: 100% !important;
+}
+
+/* Adjust search results dropdown width */
+.wy-side-nav-search .wy-dropdown-menu {
+ width: 100% !important;
+ max-width: none !important;
+ left: 0 !important;
+ right: 0 !important;
+}
+
+/* Resize handle is created by JavaScript */
+
+/* Make sure the sidebar content doesn't overflow */
+.wy-side-scroll {
+ width: 100% !important;
+ flex: 1 !important;
+ overflow-y: auto !important;
+ overflow-x: hidden !important;
+ padding-right: 10px !important;
+ box-sizing: border-box !important;
+ scroll-behavior: auto !important; /* Prevent smooth scrolling on sidebar itself */
+}
+
+/* Ensure proper scroll behavior for main content area */
+html {
+ scroll-behavior: smooth !important;
+}
+
+/* Ensure anchor links work properly in main content */
+.wy-nav-content-wrap {
+ scroll-behavior: smooth !important;
+}
+
+/* Fix scroll to target for anchor links */
+.rst-content {
+ scroll-behavior: smooth !important;
+}
+
+/* Fix anchor scroll offset to account for fixed header */
+.rst-content .section {
+ scroll-margin-top: 60px;
+}
+
+/* Fix anchor scroll offset for headers */
+.rst-content h1, .rst-content h2, .rst-content h3, .rst-content h4, .rst-content h5, .rst-content h6 {
+ scroll-margin-top: 60px;
+}
+
+/* Fix anchor scroll offset for specific scroll targets */
+.rst-content .headerlink {
+ scroll-margin-top: 60px;
+}
+
+/* Fix sidebar navigation styling */
+.wy-menu-vertical {
+ width: 100% !important;
+}
+
+.wy-menu-vertical li {
+ width: 100% !important;
+}
+
+.wy-menu-vertical a {
+ width: 100% !important;
+ word-wrap: break-word !important;
+ white-space: normal !important;
+}
+
+/* Content area margin is handled by JavaScript */
+
+/* Custom drag handle (more visible) */
+.resize-handle {
+ position: absolute;
+ top: 0;
+ right: 0;
+ width: 8px;
+ height: 100%;
+ background: #ccc;
+ cursor: col-resize;
+ z-index: 1001;
+ opacity: 0.3;
+ transition: opacity 0.2s ease;
+}
+
+.resize-handle:hover {
+ opacity: 0.8;
+ background: #999;
+}
+
+.resize-handle::before {
+ content: '';
+ position: absolute;
+ top: 50%;
+ left: 50%;
+ width: 2px;
+ height: 20px;
+ background: #666;
+ transform: translate(-50%, -50%);
+ border-radius: 1px;
+}
+
+.resize-handle:hover::before {
+ background: #333;
+}
+
+/* Ensure smooth resizing */
+.wy-nav-side.resizing {
+ user-select: none;
+ pointer-events: none;
+}
+
+.wy-nav-side.resizing .wy-side-scroll {
+ overflow: hidden;
+}
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/docs/_static/js/resizable-sidebar.js b/code/RL_model/verl/verl_train/docs/_static/js/resizable-sidebar.js
new file mode 100644
index 0000000000000000000000000000000000000000..2a51fa90043bb0ecf78149b092fd3447740fdaee
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/_static/js/resizable-sidebar.js
@@ -0,0 +1,251 @@
+// Resizable sidebar functionality
+document.addEventListener('DOMContentLoaded', function() {
+ const sidebar = document.querySelector('.wy-nav-side');
+ const content = document.querySelector('.wy-nav-content-wrap');
+
+ if (!sidebar || !content) return;
+
+ // Create resize handle
+ const resizeHandle = document.createElement('div');
+ resizeHandle.className = 'resize-handle';
+ sidebar.appendChild(resizeHandle);
+
+ let isResizing = false;
+ let startX = 0;
+ let startWidth = 0;
+
+ // Get initial width
+ const getInitialWidth = () => {
+ return 300; // Default width
+ };
+
+ // Save width to localStorage
+ const saveWidth = (width) => {
+ localStorage.setItem('sidebar-width', width);
+ };
+
+ // Load width from localStorage
+ const loadWidth = () => {
+ const savedWidth = localStorage.getItem('sidebar-width');
+ if (savedWidth) {
+ const width = parseInt(savedWidth, 10);
+ if (width >= 200 && width <= 600) {
+ return width;
+ }
+ }
+ return getInitialWidth();
+ };
+
+ // Apply width to sidebar and content
+ const applyWidth = (width) => {
+ // Update sidebar width
+ sidebar.style.width = width + 'px';
+
+ // Update content margin with !important to override any CSS
+ content.style.setProperty('margin-left', width + 'px', 'important');
+
+ // Also update any other content wrapper that might exist
+ const contentInner = document.querySelector('.wy-nav-content');
+ if (contentInner) {
+ contentInner.style.setProperty('margin-left', '0px', 'important');
+ }
+
+ // Force reflow and repaint
+ sidebar.offsetHeight;
+ content.offsetHeight;
+
+ // Trigger window resize event to notify other components
+ window.dispatchEvent(new Event('resize'));
+ };
+
+ // Initialize with saved width
+ const initialWidth = loadWidth();
+ applyWidth(initialWidth);
+
+ // Mouse down on resize handle
+ resizeHandle.addEventListener('mousedown', (e) => {
+ isResizing = true;
+ startX = e.clientX;
+ startWidth = parseInt(window.getComputedStyle(sidebar).width, 10);
+
+ sidebar.classList.add('resizing');
+ document.body.style.cursor = 'col-resize';
+ document.body.style.userSelect = 'none';
+
+ // Add overlay to prevent iframe issues
+ const overlay = document.createElement('div');
+ overlay.style.cssText = `
+ position: fixed;
+ top: 0;
+ left: 0;
+ width: 100%;
+ height: 100%;
+ z-index: 9999;
+ cursor: col-resize;
+ `;
+ overlay.id = 'resize-overlay';
+ document.body.appendChild(overlay);
+
+ e.preventDefault();
+ });
+
+ // Mouse move
+ document.addEventListener('mousemove', (e) => {
+ if (!isResizing) return;
+
+ const width = startWidth + e.clientX - startX;
+ const clampedWidth = Math.max(200, Math.min(600, width));
+ applyWidth(clampedWidth);
+ });
+
+ // Mouse up
+ document.addEventListener('mouseup', () => {
+ if (!isResizing) return;
+
+ isResizing = false;
+ sidebar.classList.remove('resizing');
+ document.body.style.cursor = '';
+ document.body.style.userSelect = '';
+
+ // Remove overlay
+ const overlay = document.getElementById('resize-overlay');
+ if (overlay) {
+ overlay.remove();
+ }
+
+ // Save the current width
+ const currentWidth = parseInt(window.getComputedStyle(sidebar).width, 10);
+ saveWidth(currentWidth);
+ });
+
+ // Handle window resize - removed to prevent infinite loop
+ // The sidebar width is fixed and managed by drag functionality, no need to recalculate on window resize
+
+ // Double-click to reset to default width
+ resizeHandle.addEventListener('dblclick', () => {
+ const defaultWidth = 300;
+ applyWidth(defaultWidth);
+ saveWidth(defaultWidth);
+ });
+});
+
+// Fix navigation issues - Using MutationObserver for reliable initialization
+document.addEventListener('DOMContentLoaded', function() {
+ let navigationFixed = false;
+
+ function setupNavigationFix() {
+ if (navigationFixed) return;
+
+ // Find all links in the sidebar
+ const sidebarLinks = document.querySelectorAll('.wy-menu-vertical a');
+
+ // Only proceed if we have sidebar links
+ if (sidebarLinks.length === 0) return;
+
+ console.log('Setting up navigation fix...');
+
+ sidebarLinks.forEach(function(link) {
+ const href = link.getAttribute('href');
+
+ // Clone the link to remove all existing event listeners
+ const newLink = link.cloneNode(true);
+
+ // Add our own click handler
+ newLink.addEventListener('click', function(e) {
+ console.log('Link clicked:', href);
+
+ // If it's an anchor link within the same page
+ if (href && href.startsWith('#') && href !== '#') {
+ e.preventDefault();
+ e.stopPropagation();
+
+ const targetId = href.substring(1);
+ const targetElement = document.getElementById(targetId);
+
+ if (targetElement) {
+ // Calculate offset for fixed header
+ const headerHeight = 60;
+ const elementPosition = targetElement.getBoundingClientRect().top;
+ const offsetPosition = elementPosition + window.pageYOffset - headerHeight;
+
+ window.scrollTo({
+ top: offsetPosition,
+ behavior: 'smooth'
+ });
+
+ // Update URL hash
+ if (history.pushState) {
+ history.pushState(null, null, '#' + targetId);
+ } else {
+ location.hash = '#' + targetId;
+ }
+ }
+ }
+ // For external links, navigate normally
+ else if (href && !href.startsWith('#') && !href.startsWith('javascript:')) {
+ console.log('Navigating to external link:', href);
+ window.location.href = href;
+ }
+ });
+
+ // Replace the old link with the new one
+ link.parentNode.replaceChild(newLink, link);
+ });
+
+ navigationFixed = true;
+
+ // Handle initial page load with hash
+ if (window.location.hash) {
+ // Use requestAnimationFrame for better timing
+ requestAnimationFrame(() => {
+ const targetId = window.location.hash.substring(1);
+ const targetElement = document.getElementById(targetId);
+ if (targetElement) {
+ const headerHeight = 60;
+ const elementPosition = targetElement.getBoundingClientRect().top;
+ const offsetPosition = elementPosition + window.pageYOffset - headerHeight;
+
+ window.scrollTo({
+ top: offsetPosition,
+ behavior: 'smooth'
+ });
+ }
+ });
+ }
+ }
+
+ // Try to set up navigation fix immediately
+ setupNavigationFix();
+
+ // If it didn't work, use MutationObserver to watch for when sidebar links are added
+ if (!navigationFixed) {
+ const observer = new MutationObserver(function(mutations) {
+ mutations.forEach(function(mutation) {
+ if (mutation.type === 'childList' && mutation.addedNodes.length > 0) {
+ // Check if sidebar links were added
+ const sidebarLinks = document.querySelectorAll('.wy-menu-vertical a');
+ if (sidebarLinks.length > 0) {
+ setupNavigationFix();
+ if (navigationFixed) {
+ observer.disconnect();
+ }
+ }
+ }
+ });
+ });
+
+ // Start observing the document for changes
+ observer.observe(document.body, {
+ childList: true,
+ subtree: true
+ });
+
+ // Fallback timeout in case MutationObserver doesn't work
+ setTimeout(function() {
+ if (!navigationFixed) {
+ setupNavigationFix();
+ }
+ observer.disconnect();
+ }, 5000);
+ }
+});
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/docs/_static/js/runllm-widget.js b/code/RL_model/verl/verl_train/docs/_static/js/runllm-widget.js
new file mode 100644
index 0000000000000000000000000000000000000000..bec345cacc5b943693e1bf1973a7a6d863b0d85e
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/_static/js/runllm-widget.js
@@ -0,0 +1,14 @@
+document.addEventListener("DOMContentLoaded", function () {
+ var script = document.createElement("script");
+ script.type = "module";
+ script.id = "runllm-widget-script";
+ script.src = "https://widget.runllm.com";
+ script.setAttribute("version", "stable");
+ script.setAttribute("crossorigin", "true");
+ script.setAttribute("runllm-keyboard-shortcut", "Mod+j");
+ script.setAttribute("runllm-name", "verl Chatbot");
+ script.setAttribute("runllm-position", "TOP_RIGHT");
+ script.setAttribute("runllm-assistant-id", "679");
+ script.async = true;
+ document.head.appendChild(script);
+ });
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/docs/_static/logo.png b/code/RL_model/verl/verl_train/docs/_static/logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..424f538ee96d0916efaf6a59dbec674e06e40148
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/_static/logo.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd27c16b2122527e513ea8884e0ad175f59c73af2ca1e10b1acaab38196a8638
+size 84701
diff --git a/code/RL_model/verl/verl_train/docs/advance/agent_loop.rst b/code/RL_model/verl/verl_train/docs/advance/agent_loop.rst
new file mode 100644
index 0000000000000000000000000000000000000000..013ec9ed887924138c92d3bf12d94dd035ad5301
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/advance/agent_loop.rst
@@ -0,0 +1,238 @@
+Agent Loop
+==========
+
+Last updated: 07/17/2025.
+
+.. versionadded:: 0.4.2
+ [status: alpha]
+
+.. warning::
+   Agent Loop is ready for use, but the API may change in future releases.
+
+Agent Loop is designed as a general interface for multi-turn rollout and agentic reinforcement learning.
+
+**Design goal**:
+
+- Pluggable, user-defined agent loops
+- Provide a standard request/generate API across different inference frameworks
+- Provide request-level load balancing across multiple inference servers
+
+**Non-goal**:
+
+- How tools are defined and how to call them
+
+At a high level, an agent loop is given a prompt and runs a user-defined loop: calling the LLM generate API, calling tools, ...
+and returning the final output. A reward is then computed on the final output, which is used as a trajectory for RL training.
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop_overview.svg?raw=true
+
+
+API Design
+----------
+
+The ``AgentLoopBase`` class is the abstraction of an agent loop, and the ``run`` method is the only interface that users need to implement.
+Given prompt messages in the format ``[{"role": "user", "content": "..."}]`` and additional sampling params,
+the run method can do whatever the user wants, such as
+
+- call LLM generate api
+- call tools: web search, database query, code sandbox, ...
+- environment interaction
+- reflection
+- ...
+
+.. code:: python
+
+ class AgentLoopBase(ABC):
+ @abstractmethod
+ async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput:
+ """Run agent loop to interact with LLM server and environment.
+
+ Args:
+ sampling_params (Dict[str, Any]): LLM sampling params.
+ **kwargs: dataset fields from `verl.utils.dataset.RLHFDataset`.
+
+ Returns:
+ AgentLoopOutput: Agent loop output.
+ """
+ raise NotImplementedError
+
+After running the user-defined loop, the ``run`` method should return an ``AgentLoopOutput``, including prompt token ids,
+response token ids, and a response mask.
+
+.. code:: python
+
+ class AgentLoopOutput(BaseModel):
+ """Agent loop output."""
+
+ prompt_ids: list[int]
+ """Prompt token ids."""
+ response_ids: list[int]
+ """Response token ids including LLM generated token, tool response token."""
+ response_mask: list[int]
+ """Response mask, 1 for LLM generated token, 0 for tool response token."""
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop_output.svg?raw=true
+
+.. note:: AgentLoopOutput only outputs one trajectory for a given prompt; support for multiple trajectories per prompt is still under discussion.
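+
+For illustration, a minimal single-turn agent loop could look like the sketch below. The class name, the attributes ``self.server_manager`` and ``self.tokenizer``, and the ``raw_prompt`` dataset field are assumptions for this example; consult the actual ``AgentLoopBase`` implementation for the exact initialization contract.
+
+.. code:: python
+
+    import uuid
+    from typing import Any
+
+    # AgentLoopBase and AgentLoopOutput come from verl's agent loop module.
+    class SingleTurnAgentLoop(AgentLoopBase):
+        async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput:
+            # Dataset fields arrive via kwargs; "raw_prompt" is assumed here.
+            messages = list(kwargs["raw_prompt"])
+            prompt_ids = self.tokenizer.apply_chat_template(
+                messages, add_generation_prompt=True, tokenize=True
+            )
+
+            # Single LLM call; the request_id enables sticky-session routing.
+            response_ids = await self.server_manager.generate(
+                request_id=uuid.uuid4().hex,
+                prompt_ids=prompt_ids,
+                sampling_params=sampling_params,
+            )
+
+            return AgentLoopOutput(
+                prompt_ids=prompt_ids,
+                response_ids=response_ids,
+                response_mask=[1] * len(response_ids),  # every token is LLM-generated
+            )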
+
+Architecture Design
+-------------------
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop_architecture.png?raw=true
+
+A single PPO step contains two phases: rollout and train. In the rollout phase:
+
+1. PPOTrainer samples a batch from the dataset and calls ``AgentLoopManager.generate_sequences``.
+2. AgentLoopManager calls ``wake_up`` on all async LLM server instances, which syncs weights between the inference engine (vLLM/SGLang) and the training engine (FSDP/Megatron-LM).
+3. AgentLoopManager splits the batch into chunks and sends each chunk to an ``AgentLoopWorker``.
+4. AgentLoopWorker receives a chunk and, for each prompt, spawns a user-defined ``AgentLoopBase`` instance and runs the ``run`` coroutine to completion to get an ``AgentLoopOutput``.
+
+.. tip::
+   AgentLoopWorker schedules multiple coroutines concurrently. If the number of AgentLoopWorkers equals batch_size, then each worker is responsible for one prompt.
+
+Within the agent loop, when the user needs the LLM to generate a response:
+
+5. Call ``AsyncLLMServerManager.generate`` with prompt_ids.
+6. AsyncLLMServerManager selects the server instance with the fewest in-flight requests on the first turn and sends the request to it. (In following turns, the request is sent to the same server instance.)
+7. AsyncLLMServer receives the request, issues ipc/rpc to the model_runner, and generates the response. (There are slight differences between vLLM and SGLang; see below.)
+
+When all prompts in all AgentLoopWorkers finish, AgentLoopManager gathers the results and returns them to the PPOTrainer.
+
+8. AgentLoopManager puts all server instances to ``sleep``, which frees the KV cache and offloads weights to CPU memory.
+
+AsyncLLMServer
+~~~~~~~~~~~~~~
+
+AsyncLLMServer is the abstraction of LLM server with two types of generation api:
+
+- `OpenAI chat completion <https://platform.openai.com/docs/api-reference/chat/create>`_: generate a response for the given chat conversation.
+- Token in token out: generate response ids for the given token ids.
+
+We officially support vLLM and SGLang AsyncLLMServer; both implement the two APIs and are well tested.
+Other inference engines should be easy to plug in by implementing the ``AsyncServerBase`` class.
+
+.. code:: python
+
+ class AsyncServerBase(ABC):
+ @abstractmethod
+ async def chat_completion(self, raw_request: Request) -> JSONResponse:
+ """OpenAI chat completion API.
+
+ Args:
+ raw_request (Request): raw json request
+
+ Returns:
+ JSONResponse: json response
+
+ API reference: https://platform.openai.com/docs/api-reference/chat/create
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str) -> list[int]:
+ """Generate response ids given prompt ids.
+
+ Args:
+ prompt_ids (List[int]): prompt ids
+ sampling_params (Dict[str, Any]): sampling params
+ request_id (str): request id
+
+ Returns:
+ List[int]: response ids
+ """
+ raise NotImplementedError
+
+
+Chat completion vs Token in token out
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. warning::
+ The following conclusion is based on our recent experience and is still open to investigation and discussion.
+
+Almost all agent frameworks (LangGraph, CrewAI, LlamaIndex, etc.) call the LLM with the OpenAI chat completion API and
+keep the chat history as messages. So users may expect us to use the chat completion API in multi-turn rollout.
+
+But based on our recent experience with single-turn training on DAPO and multi-turn training on ReTool,
+we found that the token_ids from applying the chat template to the final messages may not equal the token_ids obtained by concatenating the prompt_ids and response_ids of each turn.
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/multi_turn.png?raw=true
+
+**Where does this inconsistency happen?**
+
+First, the tool parser may alter the content. For example:
+
+.. code:: json
+
+ {"role": "assistant", "content": "Let me call a ... and get the result"}
+
+After tool_calls extraction, the message looks like this:
+
+.. code:: json
+
+ {"role": "assistant", "content": "Let me call a and get the result", "tool_calls": [{"name": "foo", "arguments": "{}"}]}
+
+Re-encoding the extracted message does not reproduce the original LLM-generated response_ids.
+
+Second, the `decode-encode` round trip may also lead to inconsistency (see Agent-R1 issue #30).
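+
+To see the round-trip effect concretely, the sketch below (the model name is only illustrative) re-encodes a decoded response and compares the token ids:
+
+.. code:: python
+
+    from transformers import AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
+
+    # Token ids sampled from the policy in a previous turn (toy example).
+    response_ids = tokenizer.encode("Let me call a tool and get the result")
+
+    # Decode to text (as a chat-history message would be stored), then re-encode.
+    round_trip_ids = tokenizer.encode(tokenizer.decode(response_ids))
+
+    # Equality is not guaranteed in general: decoding can merge or normalize
+    # tokens, and tool-call extraction changes the text itself, so training on
+    # the re-encoded sequence can silently drift off-policy.
+    print(round_trip_ids == response_ids)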
+
+**What is the impact of this inconsistency?**
+
+This inconsistency is not a big problem for serving/agent systems, but it is critical for RL training:
+it causes the trajectory to deviate from the policy model's distribution. We have observed that applying apply_chat_template
+to the final chat history messages makes PPO training fail to converge even in the single-turn setting.
+
+vLLM
+^^^^
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/async_vllm.png?raw=true
+
+For vLLM, the Async LLM Engine runs in the same process as the server, and the ModelRunner runs in the same process as the FSDP/Megatron-LM workers.
+The Async LLM Engine communicates with the ModelRunner through ZeroMQ. When the server receives a request, it directly calls the engine to generate response_ids.
+
+SGLang
+^^^^^^
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/async_sglang.png?raw=true
+
+For SGLang, the Async LLM Engine runs in the same process as FSDP/Megatron-LM worker-0 and spawns multiple subprocesses as ModelRunners.
+The Async LLM Engine likewise communicates with the ModelRunners through ZeroMQ. When the server receives a request, it makes a remote call to worker-0 and gets the response_ids.
+
+AsyncLLMServerManager
+~~~~~~~~~~~~~~~~~~~~~
+
+AsyncLLMServerManager serves as a proxy to multiple AsyncLLMServer instances and provides:
+
+- load balancing: selects the server instance with the fewest in-flight requests on the first turn and sends the request to it.
+- sticky sessions: binds a request_id to a server instance, so that the same request_id is sent to the same server instance in following turns.
+
+AsyncLLMServerManager is passed to ``AgentLoopBase.__init__``; whenever users want to interact with the LLM in the agent loop,
+they can call ``AsyncLLMServerManager.generate`` to generate response_ids.
+
+.. code:: python
+
+ class AsyncLLMServerManager:
+ async def generate(
+ self,
+ request_id,
+ *,
+ prompt_ids: list[int],
+ sampling_params: dict[str, Any],
+ ) -> list[int]:
+ """Generate tokens from prompt ids.
+
+ Args:
+ request_id (str): request id for sticky session.
+ prompt_ids (List[int]): List of prompt token ids.
+ sampling_params (Dict[str, Any]): Sampling parameters for the chat completion.
+
+ Returns:
+ List[int]: List of generated token ids.
+ """
+ ...
+
+Next
+----
+
+- :doc:`Agentic RL Training<../start/agentic_rl>`: quick-start agentic RL training with the GSM8K dataset.
+- LangGraph MathExpression: demonstrates how to use LangGraph to build an agent loop.
+- ReTool: end-to-end ReTool paper reproduction using a tool agent.
diff --git a/code/RL_model/verl/verl_train/docs/advance/async-on-policy-distill.md b/code/RL_model/verl/verl_train/docs/advance/async-on-policy-distill.md
new file mode 100644
index 0000000000000000000000000000000000000000..55b8d392206c94968d6ade5a29ce82eb8d267c8f
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/advance/async-on-policy-distill.md
@@ -0,0 +1,242 @@
+# Recipe: Async On-Policy Knowledge Distillation Trainer
+
+**Authors:** Brilliant Hanabi, furunding
+
+**Last updated:** 2025-11-08
+
+## 1. Background
+
+On-policy knowledge distillation (KD) trains a student policy to imitate a stronger teacher using samples drawn from the student's current policy. For each on-policy rollout the teacher returns soft, top-k token distributions and the student is optimized with a token-wise sparse KL objective that focuses learning on the teacher's high-probability modes. Because training examples come from the student's own state distribution, KD reduces distributional mismatch relative to off-policy distillation or supervised fine-tuning (SFT), improving stability and sample efficiency. Compared with reinforcement learning, KD avoids high-variance reward-based optimization and complex reward design by providing dense, informative per-token targets, which typically yields faster convergence and simpler scaling. Recent empirical and implementation-focused writeups (e.g., [ThinkingMachines' blog on on-policy distillation](https://thinkingmachines.ai/blog/on-policy-distillation/)) also demonstrate that on-policy distillation can deliver high-quality behavior with substantially lower compute and data requirements than many alternative approaches.
+
+Built on verl’s Ray-based single-controller components, we initially assembled a strictly on-policy KD pipeline where rollout generation, teacher knowledge acquisition, and policy optimization ran in lockstep. In practice, this synchronous design proved highly inefficient: the three stages had to wait for one another, creating pipeline bubbles and underutilized GPUs. To address this, we extend the asynchronous schedulers introduced by the One-Step-Off Policy pipeline to overlap these phases. This overlap preserves the same distillation objective while trading some strict on-policy guarantees for substantial gains in end-to-end throughput and hardware utilization.
+
+## 2. Distillation Overview and Objective
+
+This recipe centers on on-policy knowledge distillation: the student policy learns from a stronger teacher on samples generated by the current policy (on-policy). For each input prompt, the student (actor) generates responses; the teacher provides top-k token distributions, and the student is trained to match them token-wise.
+
+Core components:
+
+1. Teacher signal: top-k log-probabilities and token indices per valid token position.
+2. Student objective: sparse, token-level KL divergence between student logits and teacher top-k distribution.
+
+Objective: encourage student probabilities $Q$ to cover teacher modes $P$ using token-wise $\mathrm{KL}(P\,\|\,Q)$ computed on the teacher's top-k support.
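+
+For reference, a minimal dense-framework sketch of this objective is shown below. It is **not** the recipe's TP-aware Megatron kernel (`megatron_kl_loss.py`); the function name and tensor layout are assumptions for illustration.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def sparse_topk_kl_loss(student_logits, teacher_topk_logps, teacher_topk_indices, mask):
+    """Token-wise KL(P || Q) restricted to the teacher's top-k support.
+
+    student_logits:       [B, T, V] raw student logits
+    teacher_topk_logps:   [B, T, K] teacher log-probs of its top-k tokens
+    teacher_topk_indices: [B, T, K] the corresponding token ids
+    mask:                 [B, T]    1 for valid (response) token positions
+    """
+    student_logps = F.log_softmax(student_logits, dim=-1)                 # [B, T, V]
+    student_topk_logps = student_logps.gather(-1, teacher_topk_indices)  # [B, T, K]
+    teacher_probs = teacher_topk_logps.exp()
+    # sum_k P_k * (log P_k - log Q_k) per token position
+    kl = (teacher_probs * (teacher_topk_logps - student_topk_logps)).sum(-1)
+    return (kl * mask).sum() / mask.sum().clamp(min=1)
+```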
+
+## 3. Efficient System Design
+
+### 3.1 Schedulers (One-Step / Two-Step Off-Policy)
+
+The native (serial) on-policy distillation process is shown in the figure below.
+
+
+
+This recipe supports optional schedulers that overlap generation, teacher querying, and updates to improve throughput without changing the distillation objective.
+
+#### 3.1.1 One-Step-Off-Policy
+
+
+
+- Warm-up: 2 steps.
+- Overlap pattern: rollout while actor update; weight sync while teacher retrieving.
+- Timing keys: `sync_rollout_weights`, `wait_prev_gen`, `wait_prev_teacher`.
+
+#### 3.1.2 Two-Step-Off-Policy
+
+
+
+- Warm-up: 3 steps.
+- Overlap pattern: rollout, actor update while teacher retrieving; interleave weight sync.
+- Timing keys: `sync_rollout_weights`, `max(wait_prev_gen, wait_prev_prev_teacher)`.
+
+Tip: use `two_step_off` when the teacher takes much longer than weight sync, and `one_step_off` for simpler overlapping.
+
+Practical details:
+
+- Inputs per batch: `teacher_topk_logps`, `teacher_topk_indices`, `attention_mask` (to select valid token positions).
+- Loss injection: last pipeline stage computes KL via a logits processor; earlier stages remain unchanged.
+- Optional dynamic micro-batching groups sequences by density to reduce padding overhead.
+
+The pipeline:
+
+1. Actor parameters are synchronized to a rollout worker group (NCCL broadcast) with a small amount of latency.
+2. Rollout workers (vLLM-backed) generate sequences asynchronously (`async_generate_sequences`).
+3. Teacher client service (ZeroMQ based) returns top-k log-probabilities + token indices for each sequence (batched micro-requests), enabling KL-based guidance.
+4. Megatron actor performs a KL divergence computation between student logits and teacher top-k distributions (custom TP-aware kernel in `megatron_kl_loss.py`).
+5. Scheduling strategies (`one_step_off_scheduler`, `two_step_off_scheduler`) can optionally overlap phases for higher throughput (see Section 3.1).
+
+### 3.2 Weights sync between actor and rollout
+
+We initially followed the weight synchronization path from the One-Step-Off-Policy recipe (Ray collective broadcast across all actor and rollout ranks, plus Megatron-side allgather of parameter shards). In practice this became the dominant bottleneck, so we made three changes:
+
+1. Batch-and-bulk load on the rollout side: instead of streaming tensors one by one (as in the one-step-off-policy recipe), we stage a bundle of parameter tensors and issue a single batched load into the rollout engine. In our setup this reduced the weight-loading time by roughly 3×.
+2. Batch-and-bulk broadcast between the actor and rollout: instead of streaming tensors one by one (as in the one-step-off-policy recipe), we stage a bundle of parameter tensors and issue a single batched broadcast between the actor and rollout workers.
+3. Replace allgather with gather-to-root in Megatron: parameter shards are gathered to actor rank 0 (rather than allgathered to every rank), and that root then serves as the single source for broadcasting to rollout ranks. On top of change 1, changes 2 and 3 delivered an additional ~4× speedup in the synchronization phase. See the sketch below.
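+
+For intuition, here is a minimal single-tensor sketch of the gather-to-root + single-broadcast pattern from changes 2 and 3. It assumes an initialized `torch.distributed` process group, sharding along dim 0, and equal shard shapes across ranks; the real recipe batches many parameters into one bundle.
+
+```python
+import torch
+import torch.distributed as dist
+
+def gather_then_broadcast(local_shard: torch.Tensor, root: int = 0) -> torch.Tensor:
+    rank, world = dist.get_rank(), dist.get_world_size()
+    # 1) Gather shards to the root only (cheaper than allgather to every rank).
+    shards = [torch.empty_like(local_shard) for _ in range(world)] if rank == root else None
+    dist.gather(local_shard, gather_list=shards, dst=root)
+    # 2) The root assembles the full parameter and broadcasts it once.
+    if rank == root:
+        full = torch.cat(shards, dim=0)
+    else:
+        full = torch.empty(
+            (world * local_shard.shape[0], *local_shard.shape[1:]),
+            dtype=local_shard.dtype,
+            device=local_shard.device,
+        )
+    dist.broadcast(full, src=root)
+    return full
+```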
+
+## 4. High-Level Data & Control Flow
+
+```
+Driver (TaskRunner)
+ ├─ Initialize Ray, tokenizer, datasets, worker groups
+ ├─ Build ResourcePoolManager (actor vs rollout GPU layouts)
+ ├─ Trainer.fit()
+ ├─ init_workers(): build actor + rollout groups, broadcast weight metadata, create nccl collective group
+ ├─ continuous_iterator(): epochs → batches
+ ├─ scheduler (see Section 6)
+ • _async_gen_next_batch(): optional weight sync + non-blocking rollout
+ • _async_get_teacher_knowledge(): submit teacher requests, store future
+ ├─ For each step:
+ • Sync rollout weights
+ • Retrieve (batch, gen_output, teacher_output) from futures
+ • Merge gen + teacher outputs → DataProto
+ • Compute metrics (response length stats, timing, throughput)
+ • Update actor (forward_backward_batch + KL loss + optimizer step)
+ • (Optional) save checkpoint
+```
+
+> Note: Schedulers are optional and explained later; the distillation objective is independent of how phases are overlapped.
+
+## 5. Key Components
+
+### 5.1 `OnPolicyDistillTrainer` (`ray_trainer.py`)
+- Creates `GenerationBatchFuture` objects holding rollout and (later) teacher futures.
+- Adds scheduling + teacher integration + modified metric emission (KL, timing, MFU).
+
+### 5.2 Actor Worker (Megatron)
+- `OnPolicyDistillActor.update_policy()` orchestrates micro-batch forward/backward.
+- KL Loss injection via `logits_processor` during forward on pipeline last stage.
+
+### 5.3 Rollout Worker (vLLM / SGLang)
+- Pure inference mode (`init_model` builds model; no optimizer).
+- `async_generate_sequences` returns a Ray future for overlapping.
+
+### 5.4 Teacher Service (`teacher/`)
+- Proxy + worker architecture (ZMQ REQ/REP) for batched top-k retrieval.
+- `TeacherClient.submit()` returns a `Future`; an aggregator composes micro-batches (see the sketch below).
+- Configurable temperature, max tokens, only-response mode.
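+
+For orientation, a hypothetical usage sketch of the teacher client is shown below. The import path, constructor arguments, and return layout are assumptions for illustration only; check `recipe/gkd/teacher` for the exact API.
+
+```python
+from recipe.gkd.teacher import TeacherClient  # import path is illustrative
+
+# Constructor arguments are assumed; see Section 6 for the real config keys.
+client = TeacherClient(server_ip="127.0.0.1", server_port=15555)
+
+sequences = [[101, 2023, 2003], [101, 2178]]   # token ids of rolled-out sequences
+future = client.submit(sequences)              # non-blocking; returns a Future
+topk_logps, topk_indices = future.result()     # blocks until the ZMQ reply arrives
+```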
+
+### 5.5 KL Loss (`megatron_kl_loss.py`)
+- Performs normalization & stable per-token probability construction across TP shards.
+- Gradient is (student_probs - teacher_sparse_probs) scaled by upstream grad.
+
+## 6. Configuration Highlights (`on_policy_distill_trainer.yaml`)
+
+| Section | Purpose | Notable Keys |
+|---------|---------|-------------|
+| actor_rollout_ref.teacher | Teacher server | server_ip, server_port, n_server_workers |
+| trainer | Global training control | total_epochs, save_freq, scheduler (one_step_off \| two_step_off), n_gpus_per_node, nnodes |
+| rollout | Resource split for rollout | n_gpus_per_node, nnodes |
+
+**Remember to set `trainer.n_gpus_per_node`, `trainer.nnodes`, `rollout.n_gpus_per_node` and `rollout.nnodes` to allocate GPU resources.**
+
+### Dynamic Batch Size
+
+Enable by:
+
+```bash
+actor_rollout_ref.actor.use_dynamic_bsz=True
+actor_rollout_ref.actor.max_token_len=6000 # cap post-group token length
+```
+
+Improves utilization under variable sequence lengths.
+
+### Resource Guidelines
+
+- Actor pool: `trainer.nnodes * trainer.n_gpus_per_node` GPUs.
+- Rollout pool: `rollout.nnodes * rollout.n_gpus_per_node` GPUs.
+- Ensure teacher server capacity ≈ `n_server_workers` to avoid stalls (monitor `wait_prev_teacher`).
+
+## 7. Usage Examples
+
+### 7.1 Launch Teacher Server
+
+Before starting the training process, you need a running teacher server to provide log-probability (logp) information.
+
+We provide a toy teacher server example with vLLM. It uses `telnet` to check the proxy status and the `python` command to run, so if you don't have `telnet` installed, simply delete that code in `start_server.sh`. Some OSes use `python3` rather than `python`, so you may need to adjust that as well. You can also change the teacher's port if you run into a port conflict.
+
+Three arguments can be set for the vLLM backend in `start_server.sh` / `worker.py`: `--tp-size`, `--n-logprobs`, and `--ckpt-path`. Set them before you start the server.
+
+We also provide a toy multi-node teacher server. You can start the main node using `start_server.sh` and the slave nodes using `join_server.sh`. Remember to set the args in `join_server.sh` as well, especially the main node's `$PROXY_IP` and `$PROXY_BACKEND_PORT`.
+
+During training, the student automatically uses the teacher's top-k (`--n-logprobs`) to set its own top-k argument at line 83 of `recipe/gkd/megatron_kl_loss.py`, so you don't need to set the student's top-k argument.
+
+```bash
+cd recipe/gkd/teacher
+bash start_server.sh
+# Exports ports and launches proxy + worker (default vLLM backend)
+```
+
+Verify with:
+
+```bash
+telnet localhost 15555
+```
+
+### 7.2 Minimal Local (Megatron + vLLM) Run
+
+```bash
+python3 -m recipe.gkd.main_gkd \
+ --config-path=recipe/gkd/config \
+ --config-name=on_policy_distill_trainer \
+ actor_rollout_ref.model.path=/path/to/MODEL \
+ data.train_files=/path/to/train.parquet \
+ trainer.total_epochs=2 \
+ trainer.n_gpus_per_node=4 rollout.n_gpus_per_node=2 \
+ actor_rollout_ref.teacher.server_ip=127.0.0.1 \
+ actor_rollout_ref.teacher.server_port=15555 \
+ trainer.scheduler=one_step_off
+```
+
+(Requires a running teacher server).
+
+### 7.3 Ray Job Submission (Distilled 16B Example)
+
+See `run_moonlight_dsv3_training.sh` for a full script including:
+
+- Dist ckpt path setup (`dist_checkpointing_path`)
+- Expert parallel sizing (EP / ETP)
+- Dynamic batch sizing
+- Two-step-off scheduling for deeper overlap.
+
+Submit (after adjusting paths):
+
+```bash
+bash recipe/gkd/run_moonlight_dsv3_training.sh
+```
+
+## 8. Metrics & Monitoring
+
+Emitted metrics include (prefixes may vary):
+
+- Timing: `timing/wait_prev_gen`, `timing/sync_rollout_weights`, `timing/get_teacher_knowledge`, `timing/update_actor`.
+- Sequence stats: `response_seq_len/*` (avg, max, min, counts).
+- Performance: `perf/mfu/actor`, `perf/max_memory_allocated_gb`, `perf/cpu_memory_used_gb`.
+- Distillation: `actor/kl_loss`, `actor/grad_norm`, `actor/lr`.
+
+Interpretation Tips:
+
+- High `wait_prev_teacher` → scale up `n_server_workers` and allocate more teacher GPUs, reduce the per-request batch size, or simply use `two_step_off`.
+- High `wait_prev_gen` with uniform lengths → allocate more rollout GPUs.
+- High `sync_rollout_weights` → check the NCCL environment / network congestion, and try adjusting `actor_rollout_ref.rollout.update_weights_bucket_megabytes`.
+
+## 9. Extensibility Notes
+
+- Add new schedulers by following the interface that returns `(epoch, batch, gen_output, teacher_output, timing_dict)`.
+- Integrate different distillation signals (e.g., hidden states, intermediate reasoning tokens) by extending `teacher_utils.get_teacher_knowledge` and modifying `logits_processor`.
+
+## 10. Functional Support Summary
+
+| Category | Supported |
+|----------|-----------|
+| Train engine | Megatron |
+| Rollout engine | vLLM |
+| Distillation signal | Teacher top-k logprobs & indices |
+| Scheduling | one_step_off, two_step_off |
+
+## 11. Quick Checklist Before Running
+
+- Teacher server reachable (e.g., `telnet localhost 15555`, adjusting host/port to your setup).
+- `actor_rollout_ref.model.path` contains the correct Megatron/HF config artifacts.
+- `train_files` points to a parquet dataset compatible with this recipe's dataset loader.
+- NCCL environment vars set (see `config/runtime_env.yaml`).
+
+---
+Feel free to open issues or PRs to extend scheduler variants, add new distillation objectives, or broaden engine support.
diff --git a/code/RL_model/verl/verl_train/docs/advance/attention_implementation.rst b/code/RL_model/verl/verl_train/docs/advance/attention_implementation.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c068bd92115d38a86b4ba9414ae4c5e5a18a2218
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/advance/attention_implementation.rst
@@ -0,0 +1,119 @@
+.. _attention-implementation-override:
+
+Attention Implementation Override
+==================================
+
+Last updated: 10/31/2025.
+
+By default, VERL's FSDP workers use ``flash_attention_2`` as the attention implementation for improved performance.
+However, you can now override this setting to use different attention implementations based on your needs.
+
+Supported Attention Implementations
+-----------------------------------
+
+The following attention implementations are supported (subject to model and hardware compatibility):
+
+- ``flash_attention_2``: High-performance attention implementation (default)
+- ``eager``: Standard PyTorch attention implementation
+- ``sdpa``: Scaled Dot-Product Attention (PyTorch native)
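+
+These values correspond to the ``attn_implementation`` argument accepted by Hugging Face Transformers when a model is loaded, which is how the setting ultimately takes effect. As a standalone illustration (the model path is a placeholder):
+
+.. code:: python
+
+    from transformers import AutoModelForCausalLM
+
+    # Same three options as in the list above; "sdpa" is shown here.
+    model = AutoModelForCausalLM.from_pretrained(
+        "/path/to/model",
+        attn_implementation="sdpa",
+    )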
+
+When to Override
+----------------
+
+You might want to override the attention implementation in the following scenarios:
+
+- **Debugging**: Use ``eager`` for easier debugging and better error messages
+- **Compatibility**: Some models or hardware configurations may not support ``flash_attention_2``
+- **Memory constraints**: Different implementations have different memory characteristics
+- **Performance tuning**: Testing different implementations for optimal performance
+
+Configuration Examples
+-----------------------
+
+PPO Training with Eager Attention
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To override the attention implementation for the actor, rollout, and reference models:
+
+.. code:: bash
+
+ python3 ppo_trainer.py \
+ +actor_rollout_ref.model.override_config.attn_implementation=eager \
+ [other parameters...]
+
+PPO Training with SDPA Attention
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: bash
+
+ python3 ppo_trainer.py \
+ +actor_rollout_ref.model.override_config.attn_implementation=sdpa \
+ [other parameters...]
+
+Critic Model Override
+~~~~~~~~~~~~~~~~~~~~~
+
+For training configurations that include a critic model, you can also override its attention implementation:
+
+.. code:: bash
+
+ python3 ppo_trainer.py \
+ +actor_rollout_ref.model.override_config.attn_implementation=eager \
+ +critic.model.override_config.attn_implementation=eager \
+ [other parameters...]
+
+YAML Configuration
+~~~~~~~~~~~~~~~~~~
+
+You can also specify the attention implementation in your YAML configuration file:
+
+.. code:: yaml
+
+ actor_rollout_ref:
+ model:
+ override_config:
+ attn_implementation: eager
+ # other overrides...
+
+ critic: # if using a critic model
+ model:
+ override_config:
+ attn_implementation: eager
+ # other overrides...
+
+Important Notes
+---------------
+
+**Backward Compatibility**: If you don't specify ``attn_implementation`` in the override config,
+VERL will continue to use ``flash_attention_2`` by default, ensuring backward compatibility with existing configurations.
+
+**Model Support**: Not all models support all attention implementations. Ensure your model is compatible
+with the chosen attention implementation before training.
+
+**Performance Impact**: Different attention implementations have varying performance characteristics.
+``flash_attention_2`` typically offers the best performance, while ``eager`` provides better debugging capabilities.
+
+**Hardware Dependencies**: Some attention implementations (like ``flash_attention_2``) may require
+specific hardware or CUDA versions. If you encounter compatibility issues, try using ``eager`` or ``sdpa``.
+
+Troubleshooting
+---------------
+
+If you encounter errors when using a specific attention implementation:
+
+1. **Check model compatibility**: Verify that your model supports the chosen attention implementation
+2. **Try eager attention**: Use ``attn_implementation=eager`` as a fallback for debugging
+3. **Check hardware requirements**: Ensure your hardware supports the attention implementation
+4. **Review error messages**: Attention implementation errors often provide clear guidance on supported options
+
+Example Error Resolution
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you see an error like "flash_attention_2 is not supported", you can resolve it by switching to eager attention:
+
+.. code:: bash
+
+ # Instead of the default flash_attention_2
+ python3 ppo_trainer.py +actor_rollout_ref.model.override_config.attn_implementation=eager
+
+This override ensures your training can proceed while you investigate the flash attention compatibility issue.
diff --git a/code/RL_model/verl/verl_train/docs/advance/checkpoint.rst b/code/RL_model/verl/verl_train/docs/advance/checkpoint.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9782af951d9cf626cae6b603666d3adc3114dfdc
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/advance/checkpoint.rst
@@ -0,0 +1,159 @@
+.. _checkpoint-page:
+
+Using Checkpoints to Support Fault Tolerance Training
+=====================================================
+
+Last updated: 06/25/2025.
+
+There could be training errors or machine failure during the whole RLHF training process,
+so it is recommended to enable checkpoints to minimize your loss.
+
+The API Interface has already been listed in :ref:`config-explain-page`,
+and we will not repeat them. But there are still some technique details
+we hope to clarify.
+
+.. note::
+
+   Notice that the ``checkpoint.contents`` field has no effect on FSDP checkpoints except for ``hf_model``;
+   the other three fields are bound together for saving and loading. We recommend including all of ``model``, ``optimizer`` and ``extra``.
+
+Checkpoint Saving Directory Structure
+-------------------------------------
+
+Commonly, we use the ``default_local_dir`` declared in ``ppo_trainer.yaml`` or ``ppo_megatron_trainer.yml``
+as the prefix when saving checkpoints, which is ``checkpoints/${trainer.project_name}/${trainer.experiment_name}``.
+
+So the inner checkpoint structure of **FSDP** is like:
+
+.. code::
+
+    checkpoints/${trainer.project_name}/${trainer.experiment_name}
+    ├── global_steps_${i}
+    │   ├── actor
+    │   │   ├── huggingface       # saves config and tokenizer by default; also saves the huggingface model if ``hf_model`` is included in checkpoint.contents
+    │   │   ├── fsdp_config.json  # FSDP config file, including world_size and FSDP version
+    │   │   ├── model_world_size_{self.world_size}_rank_{self.rank}.pt
+    │   │   ├── optim_world_size_{self.world_size}_rank_{self.rank}.pt
+    │   │   └── extra_state_world_size_{self.world_size}_rank_{self.rank}.pt
+    │   └── critic
+    │       ├── huggingface
+    │       ├── fsdp_config.json
+    │       ├── model_world_size_{self.world_size}_rank_{self.rank}.pt
+    │       ├── optim_world_size_{self.world_size}_rank_{self.rank}.pt
+    │       └── extra_state_world_size_{self.world_size}_rank_{self.rank}.pt
+    └── latest_checkpointed_iteration.txt
+
+All model shards, optimizers and extra states are stored together, in a sharded and distributed way.
+
+While **Megatron** current checkpoint structure is:
+
+.. code::
+
+    checkpoints/${trainer.project_name}/${trainer.experiment_name}
+    ├── global_steps_${i}
+    │   ├── actor
+    │   │   ├── huggingface   # saves config and tokenizer by default; also saves the huggingface model if ``hf_model`` is included in checkpoint.contents
+    │   │   └── dist_ckpt     # sharded model/optimizer/rng_states, named the same as in Megatron
+    │   └── critic
+    │       ├── huggingface
+    │       └── dist_ckpt
+    └── latest_checkpointed_iteration.txt
+
+Convert FSDP and Megatron Checkpoints to HuggingFace Format Model
+-----------------------------------------------------------------
+
+We provide a tool to convert the FSDP and Megatron checkpoints to HuggingFace format model.
+The tool is located in ``verl/model_merger``. For older versions of verl that don't include fsdp_config.json in checkpoints, you can use the legacy model merger located at ``verl/scripts/legacy_model_merger.py``.
+
+The script supports two main sub-commands: `merge` (to convert and save checkpoints) and `test` (to validate merged checkpoints against a reference model).
+The arguments for the `merge` sub-command are as follows:
+
+.. code:: bash
+
+ usage: python -m verl.model_merger merge [-h] --backend {fsdp,megatron} [--local_dir LOCAL_DIR] [--tie-word-embedding] [--is-value-model] [--use_cpu_initialization] [--target_dir TARGET_DIR]
+ [--hf_upload_path HF_UPLOAD_PATH] [--private]
+
+ options:
+ -h, --help show this help message and exit
+ --backend {fsdp,megatron}
+ The backend of the model
+ --local_dir LOCAL_DIR
+ Path to the saved model checkpoints
+ --tie-word-embedding Whether to tie word embedding weights (currently only Megatron supported)
+ --is-value-model Whether the model is a value model (currently only Megatron supported)
+ --use_cpu_initialization
+ Whether to use CPU initialization for the model. This is useful for large models that cannot fit into GPU memory during initialization.
+ --target_dir TARGET_DIR
+ Directory to save the merged huggingface model
+ --hf_upload_path HF_UPLOAD_PATH
+ Hugging Face repository ID to upload the model
+ --private Whether to upload the model to a private Hugging Face repository
+
+Example usage for merging Megatron checkpoints:
+
+.. code:: bash
+
+ python -m verl.model_merger merge \
+ --backend megatron \
+ --tie-word-embedding \
+ --local_dir checkpoints/verl_megatron_gsm8k_examples/qwen2_5_0b5_megatron_saveload/global_step_1/actor \
+ --target_dir /path/to/merged_hf_model
+
+Example usage for distributed merging Megatron checkpoints:
+
+.. code:: bash
+
+ torchrun --nproc_per_node 1 --nnodes 8 --node_rank ${RANK} -m verl.model_merger merge \
+ --backend megatron \
+ --tie-word-embedding \
+ --local_dir checkpoints/verl_megatron_gsm8k_examples/qwen2_5_0b5_megatron_saveload/global_step_1/actor \
+ --target_dir /path/to/merged_hf_model
+
+Example usage for merging FSDP checkpoints:
+
+.. code:: bash
+
+ python -m verl.model_merger merge \
+ --backend fsdp \
+ --local_dir checkpoints/verl_fsdp_gsm8k_examples/qwen2_5_0b5_fsdp_saveload/global_step_1/actor \
+ --target_dir /path/to/merged_hf_model
+
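+After merging, a quick sanity check is to load the result with Hugging Face
+Transformers (the path below is a placeholder):
+
+.. code:: python
+
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    model = AutoModelForCausalLM.from_pretrained("/path/to/merged_hf_model")
+    tokenizer = AutoTokenizer.from_pretrained("/path/to/merged_hf_model")
+    print(model.config)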
+
+Megatron Merger details
+-----------------------
+
+The current implementation of decoder layers uses ``nn.ModuleList`` to store the layers,
+so the model layers on every PP rank and VPP rank start their indices from 0.
+
+There are 3 ways to correct this behavior:
+
+1. Modify the decoder layer's state_dict, adding an ``offset`` to each layer's index, which means rewriting the ``nn.ModuleList`` implementation.
+2. Modify the layer indices when saving the checkpoint and recover them when loading it.
+3. Let the checkpoint merger do the work, calculating the actual ``offset`` from the ``state_dict`` alone, which is a little complex.
+
+The current implementation uses solution 2.
+
+
+HuggingFace to Megatron DistCheckpoint details
+----------------------------------------------
+
+Through ``mbridge``, we can directly save the mcore model in huggingface format during training,
+so there is no need to convert the model to Megatron dist-checkpoint format.
+
+Original Checkpoint Utils
+-------------------------
+
+Original Checkpoint Utils refer to the original checkpoint implementation in ``verl/models/[model]/megatron/checkpoint_utils``.
+
+We now only need ``[model]_loader.py`` in the original checkpoint utils, since we no longer store ``hf_model`` every time (which is not recommended for large model training; try to save only sharded models if you can).
+
+.. note::
+
+   Note that ``[model]_loader`` only supports environments where **the storage cluster is reachable from every compute node**,
+   because it uses a **sharded loading scheme to minimize checkpoint-loading overhead**.
+   Every rank loads its own data from a ``state_dict`` that is accessible to all of them,
+   and there is no need to broadcast among DP ranks, since the saved state_dict is produced only by DP rank 0.
+
+   For users who can **only place the huggingface model on one device**, we keep the original costly implementation in ``[model]_loader_deprecated``. In this implementation, rank 0 broadcasts all weights to each TP and PP rank, and then DP rank 0 broadcasts to all DP ranks. This carries a risk of OOM.
+
+   To use the deprecated loader, change the import package of ``load_state_dict_to_megatron_llama``.
diff --git a/code/RL_model/verl/verl_train/docs/advance/dpo_extension.rst b/code/RL_model/verl/verl_train/docs/advance/dpo_extension.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ee9ac619dde1ebfe3390d0b409b92252cb4e4104
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/advance/dpo_extension.rst
@@ -0,0 +1,273 @@
+Extend to other RL(HF) algorithms
+=================================
+
+Last updated: 02/25/2025.
+
+We have already implemented the complete training pipeline of the PPO
+algorithm. To extend to other algorithms, we analyze the high-level
+principles of using verl and provide a tutorial for implementing the DPO
+algorithm. Users can follow a similar paradigm to extend to other RL algorithms.
+
+.. note:: **Key ideas**: Single process drives multi-process computation and data communication.
+
+Overall Approach
+----------------
+
+Step 1: Consider what multi-machine multi-GPU computations are needed
+for each model, such as ``generate_sequence``, ``compute_log_prob`` and
+``update_policy`` in the actor_rollout model. Implement distributed
+single-program-multiple-data (SPMD) computation and encapsulate it
+into APIs.
+
+Step 2: Based on different distributed scenarios, including FSDP and 3D
+parallelism in Megatron-LM, implement single-process control of data
+interaction among multi-process computations.
+
+Step 3: Utilize the encapsulated APIs to implement the control flow
+
+Example: Online DPO
+-------------------
+
+We use verl to implement a simple online DPO algorithm. The algorithm
+flow of Online DPO is as follows:
+
+1. There is a prompt (rollout) generator which has the same weights as
+   the actor model. After a batch of prompts is fed into the generator,
+   it generates N responses for each prompt.
+2. Send all the prompts + responses to a verifier for scoring, which can
+   be a reward model or a rule-based function. Then sort them in pairs to
+   form a training batch (a pairing sketch follows this list).
+3. Use this training batch to train the actor model using DPO. During
+ the process, a reference policy is needed.
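+
+As a rough sketch of the pairing in step 2 (the function name ``generate_pairwise_data`` is illustrative, not a fixed verl API):
+
+.. code:: python
+
+    def generate_pairwise_data(prompts, responses, scores, n):
+        """Build (prompt, chosen, rejected) triples from verifier scores.
+
+        responses/scores hold n entries per prompt, laid out contiguously.
+        """
+        pairs = []
+        for i, prompt in enumerate(prompts):
+            group = list(zip(responses[i * n:(i + 1) * n],
+                             scores[i * n:(i + 1) * n]))
+            group.sort(key=lambda x: x[1], reverse=True)
+            # Highest-scored response is "chosen", lowest is "rejected".
+            pairs.append((prompt, group[0][0], group[-1][0]))
+        return pairs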
+
+Step 1: What are the multi-machine multi-GPU computations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**Sample Generator**
+
+Implementation details:
+
+.. code:: python
+
+ from verl.single_controller.base import Worker
+ from verl.single_controller.ray import RayWorkerGroup, RayClassWithInitArgs, RayResourcePool
+ import ray
+
+ @ray.remote
+ class SampleGenerator(Worker):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+
+ def generate_sequences(self, data):
+ pass
+
+Here, ``SampleGenerator`` can be viewed as multiple processes launched by
+``torchrun``, with each process running the same code (SPMD).
+``SampleGenerator`` needs to implement a ``generate_sequences`` API for
+the control flow to call. The implementation details inside can use any
+inference engine including vllm, sglang and huggingface. Users can
+largely reuse the code in
+verl/verl/workers/rollout/vllm_rollout/vllm_rollout.py and we won't
+go into details here.
+
+**ReferencePolicy inference**
+
+API: compute reference log probability
+
+.. code:: python
+
+ from verl.single_controller.base import Worker
+ import ray
+
+ @ray.remote
+ class ReferencePolicy(Worker):
+ def __init__(self):
+ super().__init__()
+ self.model = Model()
+
+ def infer(self, data):
+ return self.model(data)
+
+**Actor update**
+
+API: Update actor model parameters
+
+.. code:: python
+
+    from verl.single_controller.base import Worker
+    import ray
+    import torch.optim as optim
+    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+
+    @ray.remote
+    class DPOActor(Worker):
+        def __init__(self):
+            super().__init__()
+            self.model = Model()  # placeholder: the actual actor model
+            self.model = FSDP(self.model)  # or another distributed strategy
+            self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
+            self.loss_fn = dpo_loss  # placeholder: the DPO loss function
+
+        def update(self, data):
+            self.optimizer.zero_grad()
+            logits = self.model(data)
+            loss = self.loss_fn(logits)
+            loss.backward()
+            self.optimizer.step()
+
+**Notes: How to distinguish between control processes and distributed computation processes**
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Control processes are generally functions directly decorated with
+ ``@ray.remote``
+- Computation processes are all wrapped into a ``RayWorkerGroup``.
+
+Users can reuse most of the distributed computation logic implemented
+in the PPO algorithm, including the FSDP and Megatron-LM backends in
+verl/verl/trainer/ppo.
+
+Step 2: Based on different distributed scenarios, implement single-process control of multi-process data interaction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**The core problem to solve here is how a single process sends data to
+multiple processes, drives multi-process computation, and how the
+control process obtains the results of multi-process computation.**
+First, we initialize the multi-process ``WorkerGroup`` in the control
+process.
+
+.. code:: python
+
+ @ray.remote(num_cpus=1)
+ def main_task(config):
+ # construct SampleGenerator
+ resource_pool = RayResourcePool(process_on_nodes=[8] * 2) # 16 GPUs
+ ray_cls = RayClassWithInitArgs(SampleGenerator, config=config)
+ # put SampleGenerator onto resource pool
+ worker_group = RayWorkerGroup(resource_pool, ray_cls)
+
+ # construct reference policy
+
+As we can see, in the control process, multiple processes are wrapped
+into a ``RayWorkerGroup``. Inside this ``WorkerGroup``, there is a
+``self._workers`` member, where each worker is a RayActor
+(https://docs.ray.io/en/latest/ray-core/actors.html) of SampleGenerator.
+ray_trainer.md also provides an implementation of
+``MegatronRayWorkerGroup``.
+
+Assuming the model is distributed using FSDP, and there is a batch of
+data on the control process, for data parallelism, the underlying
+calling process is:
+
+.. code:: python
+
+    data = xxx  # a batch of data on the driver
+    data_list = data.chunk(dp_size)
+
+    output = []
+    for i, d in enumerate(data_list):
+        # worker_group._workers[i] is a SampleGenerator
+        output.append(worker_group._workers[i].generate_sequences.remote(d))
+
+ output = ray.get(output)
+ output = torch.cat(output)
+
+Single process calling multiple processes involves the following 3
+steps:
+
+1. Split the data into DP parts on the control process.
+2. Send the data to remote, call the remote computation through RPC, and
+ utilize multi-process computation.
+3. Obtain the computation results of each worker on the control process
+ and merge them.
+
+Frequently calling these 3 steps on the controller process greatly hurts
+code readability. **In verl, we have abstracted and encapsulated these 3
+steps, so that the worker's method + dispatch + collect can be
+registered into the worker_group.**
+
+.. code:: python
+
+ from verl.single_controller.base.decorator import register
+
+ def dispatch_data(worker_group, data):
+ return data.chunk(worker_group.world_size)
+
+ def collect_data(worker_group, data):
+ return torch.cat(data)
+
+ dispatch_mode = {
+ 'dispatch_fn': dispatch_data,
+ 'collect_fn': collect_data
+ }
+
+ @register(dispatch_mode=dispatch_mode)
+ def generate_sequences(self, data):
+ pass
+
+In this way, we can directly call the method inside the worker through
+the ``worker_group`` on the control (driver) process (which is a single
+process):
+
+.. code:: python
+
+ output = worker_group.generate_sequences(data)
+
+This single line includes data splitting, data distribution and
+computation, and data collection.
+
+Furthermore, the model parallelism size of each model is usually fixed,
+including dp, tp and pp. So for these common distributed scenarios, we have
+pre-implemented specific dispatch and collect methods in `decorator.py `_, which can be directly used to wrap the computations.
+
+.. code:: python
+
+ from verl.single_controller.base.decorator import register, Dispatch
+
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
+ def generate_sequences(self, data: DataProto) -> DataProto:
+ pass
+
+This requires the data interface to be ``DataProto``. The definition of
+``DataProto`` is in `protocol.py `_.
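+
+For intuition, ``DataProto`` wraps a tensor batch plus meta info and supports the chunk/concat pattern used by the dispatch functions above (a hedged sketch based on this document's usage, not the full API):
+
+.. code:: python
+
+    import torch
+    from verl.protocol import DataProto
+
+    data = DataProto.from_dict({'input_ids': torch.randint(0, 100, (8, 16))})
+    chunks = data.chunk(4)             # split along the batch dim for 4 DP ranks
+    merged = DataProto.concat(chunks)  # collect the results back on the driver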
+
+Step 3: Main training loop
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+With the above APIs, we can implement the algorithm's control
+flow. It is recommended that ``main_task`` also be a Ray remote process.
+
+.. code:: python
+
+ @ray.remote(num_cpus=1)
+ def main_task(config):
+ # construct SampleGenerator
+ resource_pool = RayResourcePool(process_on_nodes=[8] * 2) # 16 GPUs
+ ray_cls = RayClassWithInitArgs(SampleGenerator, config=config)
+ # put SampleGenerator onto resource pool
+ sample_gen = RayWorkerGroup(resource_pool, ray_cls)
+
+ # construct reference policy
+ ray_cls = RayClassWithInitArgs(ReferencePolicy)
+ ref_policy = RayWorkerGroup(resource_pool, ray_cls)
+
+ # construct actor
+ ray_cls = RayClassWithInitArgs(DPOActor)
+ dpo_policy = RayWorkerGroup(resource_pool, ray_cls)
+
+ dataloader = DataLoader()
+
+ for data in dataloader:
+ # generate data
+ data = sample_gen.generate_sequences(data)
+ # generate scores for each data
+ data = generate_scores(data)
+ # generate pairwise data using scores
+ data = generate_pairwise_data(data)
+ # generate ref_log_prob
+ data.batch['ref_log_prob'] = ref_policy.infer(data)
+ # update using dpo
+ dpo_policy.update(data)
+ # logging
+
+Here, different ``WorkerGroups`` can be placed in the same resource pool or
+in different resource pools using ``create_colocated_worker_cls``,
+similar to `ray_trainer.py `_.
diff --git a/code/RL_model/verl/verl_train/docs/advance/fp8.md b/code/RL_model/verl/verl_train/docs/advance/fp8.md
new file mode 100644
index 0000000000000000000000000000000000000000..0006392d7cd8ae3303527868900fb3254a9f1740
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/advance/fp8.md
@@ -0,0 +1,107 @@
+# FP8 rollout for verl
+
+Last updated: 12/4/2025
+
+This document introduces FP8 rollout in verl.
+
+
+We monkey patch several vLLM functions to enable FP8 rollout for reinforcement learning:
+
+1. **Quantize weights**: Quantize model weights on-the-fly from higher-precision formats to FP8.
+2. **Process weights after loading**: For vLLM, we replace the `vllm.model_executor.layers.quantization.fp8.Fp8LinearMethod.process_weights_after_loading` function to handle weight processing after quantization. For SGLang, this patch is not needed, as it natively supports loading quantized weights. A schematic of this patching appears below.
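+
+In spirit, the patch swaps in a custom function before the engine builds its layers (a schematic sketch; `custom_process_weights_after_loading` is a hypothetical stand-in for verl's actual replacement):
+
+```python
+from vllm.model_executor.layers.quantization import fp8
+
+def custom_process_weights_after_loading(self, layer):
+    # Hypothetical replacement: handle weights quantized to FP8 on-the-fly
+    # from BF16, instead of expecting a pre-quantized checkpoint.
+    ...
+
+# Monkey patch: every Fp8LinearMethod now uses the custom hook.
+fp8.Fp8LinearMethod.process_weights_after_loading = custom_process_weights_after_loading
+```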
+
+
+## Support Matrix
+- FP8 blockwise quantization for rollout, as used in DeepSeek:
+  1x128 quantization for activations and 128x128 quantization for model weights
+  (see the sketch below)
+- Dense models and MoE models
+- Async rollout interfaces
+- vLLM 0.10.x & vLLM 0.11 & SGLang 0.5.5
+- FSDP and Megatron training backends
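+
+A minimal sketch of the 128x128 weight-block quantization (illustrative only; real implementations use fused kernels such as DeepGEMM, and this assumes the weight dims are divisible by the block size):
+
+```python
+import torch
+
+def quantize_weight_fp8_blockwise(w: torch.Tensor, block: int = 128):
+    """Quantize a 2-D weight to FP8 e4m3 with one scale per 128x128 block."""
+    fp8_max = torch.finfo(torch.float8_e4m3fn).max
+    rows, cols = w.shape
+    q = torch.empty_like(w, dtype=torch.float8_e4m3fn)
+    scales = torch.empty(rows // block, cols // block, dtype=torch.float32)
+    for i in range(0, rows, block):
+        for j in range(0, cols, block):
+            blk = w[i:i + block, j:j + block].float()
+            s = blk.abs().amax().clamp(min=1e-12) / fp8_max
+            q[i:i + block, j:j + block] = (blk / s).to(torch.float8_e4m3fn)
+            scales[i // block, j // block] = s
+    return q, scales
+```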
+
+## Experiments and Outcomes
+### Qwen3-8B-Base Dense Model
+
+**Configuration**
+- DAPO recipe. AIME24 online validation.
+- vLLM(FP8 spmd rollout) + FSDP
+ - Note that SPMD rollout has been deprecated, so we removed the FP8 SPMD rollout.
+- Prompt batch size 32, n=16.
+- Rollout batch size: 32\*3\*16
+- Train_batch_size & ppo_mini_batch_size 32
+- Max response length 20K
+- Token-level TIS, C=2
+- 8*H100
+- vLLM 0.10.0+CUDA 12.6 vs vLLM 0.11.0+CUDA 12.9
+
+**Accuracy**
+
+*dark green: BF16, orange: FP8 rollout + token-level TIS, light green: FP8 rollout without TIS*
+
+Results and observations:
+- With TIS, FP8 rollout aligns with BF16
+- Obvious accuracy drop when TIS is not enabled
+- Higher mismatch KL, but within an acceptable range throughout training
+
+
+**Performance**
+
+
+*green: BF16, orange: FP8 rollout + CUDA12.6 + DeepGemm, purple: FP8 rollout + CUDA 12.9 + DeepGemm*
+
+Results and observations:
+- FP8 rollout leads to around ~12% rollout speedup with CUDA 12.6 + DeepGemm
+- When upgrading to CUDA 12.9, speedup can be up to ~18%
+
+### Qwen3-30B-A3B-Base MoE Model
+
+**Configuration**
+- DAPO recipe. AIME24 online validation.
+- FP8 async rollout, vLLM+FSDP
+- Prompt batch size 32
+- Rollout batch size: 32\*3\*16
+- Train_batch_size & ppo_mini_batch_size 32
+- Max response length 20K
+- Token-level TIS, C=2
+- 2\*8*H100
+- vLLM 0.10.0+CUDA 12.6
+
+Please refer to `recipe/dapo/run_dapo_qwen3_moe_30b_vllm_fp8_rollout.sh`
+
+**Accuracy**
+
+*grey: BF16 + token-level TIS, red: FP8 rollout + token-level TIS*
+
+Results and observations:
+- Rollout & training distribution mismatch is in general higher for MoE
+- Rollout correction required even for BF16
+- FP8 rollout with token-level TIS aligns with BF16
+
+
+**Performance**
+
+
+*grey: BF16 + token-level TIS, red: FP8 rollout + token-level TIS*
+
+Results and observations:
+- FP8 rollout: over 35% rollout speedup
+- More performance gains expected with CUDA 12.9
+
+## Usage
+
+FP8 can be enabled in the config file `verl/trainer/config/ppo_megatron_trainer.yaml`:
+
+```yaml
+ rollout:
+ quantization: "fp8"
+```
+
+Or it can be enabled on the command line:
+- `actor_rollout_ref.rollout.quantization=fp8`
+
+Please refer to `recipe/dapo/run_dapo_qwen3_moe_30b_vllm_fp8_rollout.sh` for a complete example.
diff --git a/code/RL_model/verl/verl_train/docs/advance/fsdp_extension.rst b/code/RL_model/verl/verl_train/docs/advance/fsdp_extension.rst
new file mode 100644
index 0000000000000000000000000000000000000000..181e109082262f26334034337c5915d522049759
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/advance/fsdp_extension.rst
@@ -0,0 +1,97 @@
+
+Add models with the FSDP backend
+==================================
+
+Last updated: 02/09/2025.
+
+Model
+--------------------------
+
+In principle, our FSDP backend can support any HF model, and we can
+synchronize the actor model weights with vLLM using `hf_weight_loader.py` under `third_party/vllm`.
+However, ``hf_weight_loader`` will gather the full state_dict of a
+model during synchronization, which may cause OOM. We suggest using
+``dtensor_weight_loader``, which gathers the full model parameters layer by
+layer to reduce the peak memory usage. We already support the dtensor weight
+loader for the models below in `dtensor_weight_loader.py` under `third_party/vllm`:
+
+- ``GPT2LMHeadModel``
+- ``LlamaForCausalLM``
+- ``LLaMAForCausalLM``
+- ``MistralForCausalLM``
+- ``InternLMForCausalLM``
+- ``AquilaModel``
+- ``AquilaForCausalLM``
+- ``Phi3ForCausalLM``
+- ``GemmaForCausalLM``
+- ``Gemma2ForCausalLM``
+- ``GPTBigCodeForCausalLM``
+- ``Starcoder2ForCausalLM``
+- ``Qwen2ForCausalLM``
+- ``DeepseekV2ForCausalLM``
+
+To implement the ``dtensor_weight_loader`` of a model that's supported in
+vLLM, follow the guide for the gemma model below:
+
+1. Copy the
+   ``load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]])`` method from the vllm model class
+   to ``dtensor_weight_loaders.py``.
+2. Modify the arguments to
+   ``(actor_weights: Dict, vllm_model: nn.Module)``.
+3. Replace ``self`` with ``vllm_model``.
+4. Add
+   ``local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)``
+   before each ``param = params_dict[name]`` and modify the subsequent
+   weight loading to use ``local_loaded_weight``.
+5. Register the implemented dtensor weight loader in ``__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__``.
+
+.. code-block:: diff
+
+ - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+ + def gemma_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("qkv_proj", "q_proj", "q"),
+ ("qkv_proj", "k_proj", "k"),
+ ("qkv_proj", "v_proj", "v"),
+ ("gate_up_proj", "gate_proj", 0),
+ ("gate_up_proj", "up_proj", 1),
+ ]
+ - params_dict = dict(self.named_parameters())
+ + params_dict = dict(vllm_model.named_parameters())
+ loaded_params = set()
+ - for name, loaded_weight in weights:
+ + for name, loaded_weight in actor_weights.items():
+ for (param_name, shard_name, shard_id) in stacked_params_mapping:
+ if shard_name not in name:
+ continue
+ name = name.replace(shard_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ + local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ - weight_loader(param, loaded_weight, shard_id)
+ + weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
+ break
+ else:
+ # lm_head is not used in vllm as it is tied with embed_token.
+ # To prevent errors, skip loading lm_head.weight.
+ if "lm_head.weight" in name:
+ continue
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ + local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader",
+ default_weight_loader)
+ - weight_loader(param, loaded_weight)
+ + weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
+ loaded_params.add(name)
+ unloaded_params = params_dict.keys() - loaded_params
+ if unloaded_params:
+ raise RuntimeError(
+ "Some weights are not initialized from checkpoints: "
+ f"{unloaded_params}")
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/docs/advance/fully_async.md b/code/RL_model/verl/verl_train/docs/advance/fully_async.md
new file mode 100644
index 0000000000000000000000000000000000000000..0c03bac6e86eac1f98337ed798b22311dc16c2d8
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/advance/fully_async.md
@@ -0,0 +1,595 @@
+# Recipe: Fully Async Policy Trainer
+
+**Author:** `https://github.com/meituan-search`
+
+Last updated: 12/25/2025.
+
+This document introduces a fully asynchronous PPO training system that completely decouples the Trainer and Rollouter,
+supporting asynchronous sample generation and training.
+Under this system, we achieved a 2.35x-2.67x performance improvement when training the Qwen2.5-7B model with 128 GPUs,
+without significantly affecting the results.
+
+## Introduction
+
+### Background
+
+Compared with the colocated architecture, a separated rollout/train architecture can allocate resources more
+flexibly and supports more flexible training logic, thereby addressing issues such as low GPU utilization and poor
+training efficiency caused by long-tail samples.
+The one_step_off_policy recipe alleviates the problem of long rollout times and achieves some gains in training
+efficiency by using a separated architecture and training asynchronously, one step off from the rollout.
+However, it is fixed to exactly one step of asynchrony, which is not flexible enough and cannot
+completely eliminate the impact of long-tail samples on training efficiency.
+Other frameworks such as AReaL, Magistral, StreamRL, and AsyncFlow have implemented asynchronous and streaming
+training on top of a separated architecture and achieved gains.
+We borrow from their methods and implement them in verl. The fully_async_policy supports asynchronous, streaming, and
+partial-rollout training.
+By reasonably setting parameters such as resource allocation and parameter synchronization frequency, fully_async_policy
+can significantly improve training efficiency.
+
+> Magistral https://arxiv.org/abs/2506.10910
+>
+> AReaL: A Large-Scale Asynchronous Reinforcement Learning System for Language
+> Reasoning https://arxiv.org/abs/2505.24298
+>
+> StreamRL: Scalable, Heterogeneous, and Elastic RL for LLMs with Disaggregated Stream
+> Generation https://arxiv.org/abs/2504.15930
+>
+> AsyncFlow: An Asynchronous Streaming RL Framework for Efficient LLM Post-Training https://arxiv.org/abs/2507.01663
+
+### Core Contributions
+
+- **Resource Isolation**: Unlike using hybrid_engine, Rollouter and Trainer use separate computing resources and need to
+ specify the resources they occupy separately.
+- **Parallel Generation and Training**: While the Trainer is training, the Rollouter is generating new samples.
+- **Multi-step Asynchronous**: Compared to one_step_off_policy, it supports asynchronous settings from 0.x steps to
+  multiple steps, making the asynchronous solution more flexible.
+- **NCCL Parameter Synchronization**: Built on NCCL communication primitives, referring to [checkpoint-engine](https://github.com/MoonshotAI/checkpoint-engine), to
+  achieve efficient parameter synchronization between Rollouter and Trainer.
+- **Stream Inference and Training**: Rollouter generates data sample by sample, and data transmission uses a single
+ sample as the minimum transmission unit.
+- **Asynchronous Training and Freshness Control**: By setting the parameter async_training.staleness_threshold, it
+ supports training with samples generated by old parameters.
+- **PartialRollout**: The Rollouter's inference process supports partial-rollout logic. During parameter
+  synchronization, by adding `sleep()` and `resume()` logic, it
+  saves samples from ongoing rollouts and continues using them in the next rollout, reducing the time spent waiting for
+  ongoing tasks to finish during parameter synchronization.
+
+Currently, the supported usage mode is Megatron/FSDP+vLLM/SGLang. vLLM/SGLang must use the server mode based on AgentLoop.
+
+## Design
+
+The overall architecture of fully_async_policy is shown in the figure below. fully_async_policy mainly consists of four
+parts: Rollouter, MessageQueue, Trainer, and ParameterSynchronizer.
+
+
+
+1. Rollouter generates sequences sample by sample and puts the generated samples into the MessageQueue, with the
+   production speed controlled by freshness.
+2. MessageQueue temporarily stores the samples generated by Rollouter.
+3. Trainer fetches samples from MessageQueue sample by sample. After fetching `require_batches * ppo_mini_batch_size`
+   samples, it performs a local training step (see the toy sketch after this list). After training for
+   async_training.trigger_parameter_sync_step rounds, it triggers a parameter synchronization with Rollouter.
+4. ParameterSynchronizer implements NCCL-based synchronous parameter synchronization.
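+
+The trainer side of this flow can be pictured with a toy queue (an illustrative sketch only; `mq`, `train_on`, and `sync_parameters_with_rollouter` are hypothetical stand-ins, not verl's implementation):
+
+```python
+import queue
+
+mq = queue.Queue()  # stand-in for verl's MessageQueue
+
+def train_on(batch):  # hypothetical local PPO update
+    pass
+
+def sync_parameters_with_rollouter():  # hypothetical NCCL parameter sync
+    pass
+
+def trainer_loop(trigger_parameter_sync_step, require_batches, ppo_mini_batch_size):
+    for _ in range(trigger_parameter_sync_step):
+        # Stream samples one by one until a local training batch is full.
+        batch = [mq.get() for _ in range(require_batches * ppo_mini_batch_size)]
+        train_on(batch)
+    sync_parameters_with_rollouter()
+```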
+
+The source of the gains over the baseline is that, in the colocated case, devoting more resources to
+rollout cannot remove the idleness caused by long-tail samples.
+After resource isolation, the rollout and train phases may each take longer than before (because each uses fewer
+resources),
+but overlapping them reduces the end-to-end time.
+
+
+
+## Usage
+
+### Parameter Description
+
+| parameter | meaning |
+| ---------------------------------------------------------------- | ---------------------------------------------------------------------------------------------- |
+| `trainer.nnodes` | Number of nodes for Trainer |
+| `trainer.n_gpus_per_node` | Number of GPUs per node for Trainer |
+| `rollout.nnodes` | Number of nodes for Rollouter |
+| `rollout.n_gpus_per_node` | Number of GPUs per node for Rollouter |
+| `data.train_batch_size` | In the fully async strategy, this value is not effective (default is 0) |
+| `data.gen_batch_size` | In the fully async strategy, uses streaming sample production logic (default is 1) |
+| `rollout.total_rollout_steps` | Total number of rollout samples |
+| `rollout.test_freq` | How many times Rollouter updates parameters before performing a validation |
+| `actor_rollout_ref.actor.ppo_mini_batch_size` | The ppo_mini_batch_size is a global num across all workers/gpus |
+| `async_training.require_batches` | Number of ppo_mini_batch_size batches that FullyAsyncTrainer fetches at once |
+| `async_training.trigger_parameter_sync_step` | Indicates how many local updates FullyAsyncTrainer performs before a parameter synchronization |
+| `async_training.staleness_threshold` | Freshness control |
+| `async_training.partial_rollout` | Whether to perform partial_rollout |
+| `async_training.use_rollout_log_probs` | Use log_probs generated by rollout |
+| `async_training.compute_prox_log_prob` | Whether to compute log_prob using the training model's parameters during the training phase |
+| `async_training.checkpoint_engine.enable` | Whether to use checkpoint_engine for acceleration, default `True` |
+| `async_training.checkpoint_engine.overlap_broadcast_and_consume` | When using checkpoint_engine, whether to overlap broadcast and load_weights, default `False` |
+| `async_training.checkpoint_engine.device_buffer_size_M` | When using checkpoint_engine, the user-specified bucket size (MB), default `4096` |
+| `async_training.use_trainer_do_validate` | Whether to run validation on trainer nodes, default `False` |
+
+**Further Explanation:**
+
+- `rollout.total_rollout_steps`
+
+ Compared to colocate, the quantity can be aligned by multiplying train_batch_size and step:
+ `rollout.total_rollout_steps = data.train_batch_size * step`.
+
+- `async_training.trigger_parameter_sync_step`
+
+ In the fully async strategy, it indicates how many local updates the Trainer performs (i.e., how many times it fetches
+ `require_batches * ppo_mini_batch_size` samples) before a parameter synchronization with Rollouter.
+  Between every two parameter synchronizations between Rollouter and Trainer, the Trainer will process
+  `trigger_parameter_sync_step * require_batches * ppo_mini_batch_size` samples.
+ To fairly compare speed with colocate, trigger_parameter_sync_step should be set to
+ `data.train_batch_size / (require_batches * ppo_mini_batch_size)`.
+
+- `async_training.staleness_threshold`
+
+ In the fully async strategy, it indicates the maximum proportion of stale samples allowed to be used.
+
+  - staleness_threshold=0 indicates synchronous training.
+    Rollouter will generate a fixed number of samples between two parameter updates; the sample count is:
+    $$rollout\_num = trigger\_parameter\_sync\_step * require\_batches * ppo\_mini\_batch\_size$$
+  - staleness_threshold>0 indicates asynchronous training; it can be set to a decimal for more flexible asynchronous
+    operation.
+    Rollouter will generate at most the following number of samples between two parameter updates:
+    $$rollout\_num = (1 + staleness\_threshold) * (trigger\_parameter\_sync\_step * require\_batches * ppo\_mini\_batch\_size) - num\_stale\_samples$$
+
+  num_stale_samples is the number of stale samples generated in excess during the last rollout round.
+
+  Since this is a streaming system, rollout keeps producing while the trainer keeps consuming. If the Rollouter is slower,
+  the Trainer will trigger parameter synchronization earlier, and the Rollouter will not actually produce rollout_num samples.
+  When rollout is fast enough, setting staleness_threshold to 1 is roughly equivalent to one_step_off_policy.
+  To avoid too many stale samples affecting training accuracy, it is recommended to set this value to less than 1.
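+
+  A worked example with the settings used in the experiments later in this document:
+
+  ```python
+  trigger_parameter_sync_step = 4
+  require_batches = 4
+  ppo_mini_batch_size = 32
+  staleness_threshold = 0.5
+  num_stale_samples = 0  # assume no leftover stale samples from the previous round
+
+  max_rollout_num = (1 + staleness_threshold) * (
+      trigger_parameter_sync_step * require_batches * ppo_mini_batch_size
+  ) - num_stale_samples
+  print(max_rollout_num)  # 768.0 -> at most 256 stale samples on top of the 512 consumed
+  ```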
+
+- `async_training.partial_rollout`
+
+ partial_rollout only actually takes effect when staleness_threshold>0.
+
+- `async_training.use_rollout_log_probs`
+
+ In reinforcement learning algorithms, log_probs have implicit correlations with parameter versions and tokens. Due to
+ the settings of algorithms like PPO/GRPO/DAPO, when calculating importance sampling,
+ old_log_prob must use the log_probs corresponding to the rollout parameters and tokens to ensure algorithm
+ correctness. In the fully
+ async strategy, we default to old_log_prob being calculated by rollout rather than by trainer.
+
+- `async_training.require_batches`
+
+  In streaming training, require_batches should be set to 1, indicating that training is performed as soon as
+  ppo_mini_batch_size samples have been produced.
+  In actual testing, we found that issuing fewer samples at once can, due to the order of data distribution,
+  cause training instability and longer response lengths.
+  We therefore additionally provide require_batches to control streaming distribution and the number of samples
+  participating in each training step.
+
+- `async_training.compute_prox_log_prob` (experimental)
+
+ During the training process, we observed that metrics and response lengths may become unstable in the later
+ stages of training. To mitigate this issue, we can use
+ the [Rollout Importance Sampling](https://verl.readthedocs.io/en/latest/advance/rollout_is.html)
+ technique for importance sampling. To utilize Rollout Importance Sampling, we need to compute log_prob using
+ the training engine, which requires enabling this switch.
+  Additionally, when compute_prox_log_prob and Rollout Importance Sampling are enabled under mode d
+  (async stream pipeline with partial rollout), our implementation approximates AReaL's Decoupled PPO.
+
+- `async_training.checkpoint_engine.enable`
+
+ Enabling the checkpoint engine generally reduces synchronization time overhead by more than 60% compared to
+ the original per-tensor parameter synchronization method. However, assembling buckets incurs additional
+ temporary GPU memory overhead.
+
+- `async_training.checkpoint_engine.overlap_broadcast_and_consume`
+
+ Enabling pipeline between the broadcast and load_weights parameters will allocate additional GPU memory.
+ Since the main time consumption for parameter synchronization is not in the broadcast and load_weights phases,
+ but in the parameter generation phase (by megatron or FSDP), this option is off by default.
+
+- `async_training.checkpoint_engine.device_buffer_size_M`
+
+ It controls the size of the memory buffer used for synchronization when the checkpoint-engine is enabled.
+ The actual `bucket_size` = `max(device_buffer_size_M, maximum parameter tensor size)`.
+
+  - When `overlap_broadcast_and_consume` is enabled, the additional device memory overhead is
+    `3 * bucket_size` per trainer rank and `2 * bucket_size` per rollout rank.
+  - When `overlap_broadcast_and_consume` is disabled, the additional device memory overhead is
+    `2 * bucket_size` per trainer rank and `1 * bucket_size` per rollout rank.
+
+- `async_training.use_trainer_do_validate`
+
+  It controls whether to use the trainer's `do_validate` method for validation.
+  If set to True, the trainer performs validation after each parameter update, which reduces validation time
+  overhead and trainer-node idle time.
+  If set to False, the trainer does not perform validation.
+
+### Supported Modes
+
+1. on policy pipeline:
+
+ 1. **trigger_parameter_sync_step=1, staleness_threshold=0**
+ 2. Rollouter produces `require_batches*ppo_mini_batch_size` samples at once, Trainer fetches these samples for
+ training, and after training completes, Trainer and Rollouter perform a parameter synchronization;
+ 3. During the rollout phase, if there are long-tail samples but few rollout samples, shorter samples cannot fill
+ idle resources, causing some resource waste.
+ 4. As shown in figure a;
+
+2. stream off policy pipeline:
+
+ 1. **trigger_parameter_sync_step>1, staleness_threshold=0**
+ 2. Synchronous streaming training will be performed. Rollouter produces
+ `require_batches*ppo_mini_batch_size*trigger_parameter_sync_step` samples at once, Trainer performs a local
+ training every time it fetches `require_batches*ppo_mini_batch_size` samples, and after training
+ trigger_parameter_sync_step times, Trainer and Rollouter perform a parameter synchronization;
+ 3. Compared to a, since more samples are generated at once, resource idleness will be lower.
+ 4. In one step training, there will be two periods of resource idleness: when fetching the first batch of samples,
+ train waits for `require_batches*ppo_mini_batch_size` samples to be produced, and during the last parameter
+ update, rollout waits for training to complete.
+ 5. As shown in figure b;
+
+3. async stream pipeline with stale samples:
+
+ 1. **trigger_parameter_sync_step>=1, staleness_threshold>0, partial_rollout=False**
+ 2. After each parameter update, Rollouter will plan to produce at most rollout_num samples (in practice, the number
+ of samples generated may be less than this value depending on rollout speed).
+   3. If the rollout process is relatively fast, Rollouter will generate some additional samples (num_stale_samples)
+      before parameter synchronization, for immediate use by Trainer after synchronization.
+      When parameter synchronization is triggered, if Rollouter has ongoing tasks, it will wait for them to complete
+      and will not add new tasks;
+   4. Compared to b, except for the first training step, subsequent training no longer waits for the
+      first batch of rollouts to finish, but it does wait for active tasks to finish.
+ 5. As shown in figure c;
+
+4. async stream pipeline with partial rollout:
+ 1. **trigger_parameter_sync_step>=1, staleness_threshold>0, partial_rollout=True**
+ 2. Compared to c, when triggering parameter synchronization, if Rollouter has samples being produced, it will
+ interrupt the rollout process and perform parameter synchronization. The interrupted samples will continue to be
+ generated after synchronization. This reduces the time to wait for active tasks to finish.
+ 3. As shown in figure d;
+
+
+
+### Key Metrics
+
+| metric | meaning |
+| ---------------------------------------------- | ------------------------------------------------------------------------------------------------------ |
+| `trainer/idle_ratio` | Trainer idle rate |
+| `rollouter/idle_ratio` | Rollouter idle rate |
+| `fully_async/count/stale_samples_processed` | Total number of old samples used in training |
+| `fully_async/count/stale_trajectory_processed` | Total number of old trajectories used in training (one sample produces rollout.n trajectories) |
+| `fully_async/partial/total_partial_num` | Number of partial samples processed by Trainer between two parameter synchronizations |
+| `fully_async/partial/partial_ratio` | Ratio of partial samples processed by Trainer between two parameter synchronizations |
+| `fully_async/partial/max_partial_span` | Maximum parameter-version span of partial samples processed by Trainer between two parameter synchronizations |
+
+### Parameter Tuning Recommendations
+
+- Resource Allocation and Adjustment:
+
+ - Reasonable resource allocation is the prerequisite for achieving good training efficiency. The ideal resource
+ allocation should make the rollout time and train time close, thereby minimizing pipeline bubbles in the entire
+ training process,
+ avoiding resource idleness, and ensuring Trainer does not use old samples. In real training scenarios, resource
+ allocation can be adjusted based on the idle time of rollout and train during actual training,
+ which can be obtained from rollouter/idle_ratio and trainer/idle_ratio. If rollouter/idle_ratio is high and
+ trainer/idle_ratio is low,
+ Trainer resources should be increased and Rollouter resources should be reduced, and vice versa.
+
+- Key Parameters:
+
+  - staleness_threshold: Setting it too high will cause more stale samples to be used, hurting model performance. It
+    is recommended to set it to less than 1.
+  - require_batches: The closer to 1, the closer to a pure streaming process and the smaller the training bubbles,
+    so the larger the speedup, but it affects the order of sample
+    processing.
+  - trigger_parameter_sync_step: The smaller the setting, the closer to on-policy training, but parameter
+    synchronization becomes frequent, and long-tail samples waste resources that short samples cannot fill, resulting in
+    low resource utilization.
+    The larger the setting, the higher the computational efficiency, but accuracy suffers from being more off-policy.
+  - rollout.test_freq: Validation occupies Rollouter resources, so this should not be set too small.
+
+- Mode Selection: By adjusting different parameters, the Fully Async architecture supports optimization acceleration at
+ different levels, suitable for tasks in different scenarios.
+ - For small-scale tasks that need to ensure training stability and on-policy nature, and have low speed
+ requirements, the on policy pipeline mode (Mode 1) can be tried.
+ - For scenarios that need to improve training throughput but are sensitive to staleness, the stream off policy
+ pipeline mode can be tried. That is, by
+ setting trigger_parameter_sync_step>1 to improve training efficiency, but still maintaining the synchronization
+ mechanism (staleness_threshold=0) (Mode 2).
+  - For large-scale tasks with high training-speed requirements that can tolerate a certain degree of off-policy
+    staleness, setting staleness_threshold>0 and partial_rollout=True can improve training efficiency, using the
+    async stream pipeline mode (Mode 3 or 4).
+
+### Quick Start
+
+```shell
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+ export VLLM_USE_V1=1
+ return_raw_chat="True"
+fi
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$((512*400))
+test_freq=10
+staleness_threshold=0
+trigger_parameter_sync_step=16
+partial_rollout=False
+use_dynamic_bsz=True # referenced below; define it before use
+
+
+python -m verl.experimental.fully_async_policy.fully_async_main \
+    data.train_batch_size=${train_prompt_bsz} \
+ data.gen_batch_size=${gen_prompt_bsz} \
+ data.return_raw_chat=${return_raw_chat} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+ actor_rollout_ref.actor.strategy=fsdp2 \
+ critic.strategy=fsdp2 \
+ actor_rollout_ref.hybrid_engine=False \
+ actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+ actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+ actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+ actor_rollout_ref.rollout.name=${rollout_name} \
+ actor_rollout_ref.rollout.mode=${rollout_mode} \
+ actor_rollout_ref.rollout.calculate_log_probs=True \
+ trainer.nnodes="${NNODES_TRAIN}" \
+ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+ rollout.nnodes="${NNODES_ROLLOUT}" \
+ rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+ rollout.total_rollout_steps="${total_rollout_steps}" \
+ rollout.test_freq="${test_freq}" \
+ async_training.staleness_threshold="${staleness_threshold}" \
+ async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+ async_training.partial_rollout="${partial_rollout}"
+```
+
+## Experiments
+
+### Asynchronous Training on 7B Model
+
+We used Qwen2.5-Math-7B to verify the benefits of the fully async strategy with long responses at multiple resource scales.
+Using the `async stream pipeline with stale samples` strategy, we achieved about a 2x performance improvement on 32, 64,
+and 128 GPUs without significantly affecting experimental results.
+
+- Machine: H20
+- Model: Qwen2.5-Math-7B
+- Rollout length: max_response_length FSDP2: 28K tokens;
+- Algorithm: DAPO
+- Dataset: TRAIN_FILE: dapo-math-17k.parquet TEST_FILE: aime-2024.parquet
+- Engine: vLLM + FSDP2
+- rollout.n: 16
+- ppo_mini_batch_size: 32
+- test_freq: 20
+
+- colocate sync:
+
+ - step: 400
+ - train_batch_size: 512
+
+- fully_async_policy
+ - total_rollout_steps: 512\*400
+ - require_batches: 4
+ - trigger_parameter_sync_step: 4
+ - staleness_threshold: 0.5
+ - partial_rollout: True
+
+| training mode | resource allocation | step | gen | old_log_prob | update_actor | total time<br>100 step | total time<br>200 step | total time<br>300 step | total time<br>400 step | acc/mean@1 |
+| :----------------: | :-----------------: | :----: | :----: | :----------: | :----------: | :--------------------: | :--------------------: | :--------------------: | :--------------------: | :-------------------------: |
+| colocate sync | 32 | 790.10 | 357.41 | 107.71 | 269.80 | 13h 44m | 1d 3h 43m | 2d 9h 22m | 3d 17h 5m | max: 0.3313<br>last: 0.2448 |
+| fully_async_policy | 16:16 | 294.77 | 21.26 | \ | 313.81 | 7h 58m<br>(1.72x) | 16h 21m<br>(1.70x) | 1d 0h 53m<br>(2.31x) | 1d 9h 26m<br>(2.66x) | max: 0.3302<br>last: 0.2333 |
+| colocate sync | 64 | 365.28 | 150.72 | 70.26 | 133.41 | 10h 22m | 20h 45m | 1d 7h 6m | 1d 17h 32m | max: 0.3365<br>last: 0.2333 |
+| fully_async_policy | 32:32 | 189.26 | 28.46 | \ | 156.98 | 4h 57m<br>(2.09x) | 10h 14m<br>(2.03x) | 16h 58m<br>(1.83x) | 21h 40m<br>(1.92x) | max: 0.3677<br>last: 0.3406 |
+| colocate sync | 128 | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573<br>last: 0.2958 |
+| fully_async_policy | 64:64 | 150.63 | 33.14 | \ | 113.16 | 3h 13m<br>(2.67x) | 6h 46m<br>(2.65x) | 10h 53m<br>(2.67x) | 17h 22m<br>(2.35x) | max: 0.3521<br>last: 0.3094 |
+
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-colocate_async?nw=nwuserhouzg
+
+### 128-card 7B Asynchronous Mode Experiment
+
+We used Qwen2.5-Math-7B to verify the effects of various modes supported by fully async.
+We can see that the benefit brought by streaming is approximately 1.6x, and after combining staleness and
+partial_rollout, the benefit reaches 2.35x.
+
+| mode | step | gen | old_log_prob | update_actor | total time<br>100 step | total time<br>200 step | total time<br>300 step | total time<br>400 step | acc/mean@1 |
+| :---------------------------------------------------------------------------------------------------: | :----: | :----: | :----------: | :----------: | :--------------------: | :--------------------: | :--------------------: | :--------------------: | :-------------------------: |
+| colocate sync | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573<br>last: 0.2958 |
+| `stream off policy pipeline`<br>(+fully async: trigger_parameter_sync_step=4,<br>require_batches=4) | 231.34 | 128.47 | \ | 98.77 | 4h 25m | 9h 41m | 15h 2m | 1d 1h 53m | max: 0.2844<br>last: 0.2604 |
+| `async stream pipeline with stale samples`<br>(+staleness_threshold=0.5) | | | | | | | | | |
+| `async stream pipeline with partial rollout`<br>(+partial_rollout=True) | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521<br>last: 0.3094 |
+
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-stream_stale_partial?nw=nwuserhouzg
+
+### 128-card Stale Ablation Experiment
+
+Under the `async stream pipeline with partial rollout` mode, we verified the impact of staleness settings on training
+efficiency.
+We found that the larger the staleness, the more obvious the final gains.
+We also noticed that the times for staleness values of 0.3 and 0.5 are quite close, because as the training steps
+increase, the response length changes significantly, causing training instability.
+Further analysis and optimization are needed for this issue.
+
+| staleness_threshold | step | gen | old_log_prob | update_actor | total time<br>100 step | total time<br>200 step | total time<br>300 step | total time<br>400 step | acc/mean@1 |
+| :-----------------: | :----: | :----: | :----------: | :----------: | :--------------------: | :--------------------: | :--------------------: | :--------------------: | :-------------------------: |
+| 0 | 231.34 | 128.47 | \ | 98.77 | 4h 25m | 9h 41m | 15h 2m | 1d 1h 53m | max: 0.2844<br>last: 0.2604 |
+| 0.1 | 171.30 | 58.17 | \ | 109.12 | 3h 53m | 8h 37m | 14h 25m | 19h 59m | max: 0.3542<br>last: 0.2979 |
+| 0.3 | 146.11 | 38.88 | \ | 103.22 | 3h 18m | 6h 49m | 11h 40m | 17h 20m | max: 0.3469<br>last: 0.2865 |
+| 0.5 | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521<br>last: 0.3094 |
+
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-stream_stale_partial?nw=nwuserhouzg
+
+### 128-card 7B require_batches Ablation Experiment
+
+In multiple tests, we found that the number of samples issued each time in streaming affects the response length during
+training, which in turn affects training time. We verified the impact on results by modifying
+`async_training.require_batches`.
+
+| require_batches | step | gen | old_log_prob | update_actor | total time<br>100 step | total time<br>200 step | total time<br>300 step | acc/mean@1 |
+| :-------------: | :----: | :---: | :----------: | :----------: | :--------------------: | :--------------------: | :--------------------: | :-------------------------: |
+| 1 | 203.47 | 30.88 | \ | 181.08 | 3h 31m | 8h 29m | 17h 36m | max: 0.349<br>last: 0.326 |
+| 2 | 158.72 | 26.32 | \ | 128.08 | 3h 35m | 7h 38m | 13h 57m | max: 0.351<br>last: 0.3406 |
+| 4 | 124.64 | 25.62 | \ | 95.06 | 3h 13m | 6h 46m | 10h 53m | max: 0.3521<br>last: 0.3521 |
+
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-ablation_require_batches?nw=nwuserhouzg
+
+### 30B Model Mode Experiment
+
+We achieved a 1.7x performance improvement with the `async stream pipeline with stale samples` strategy on the
+Qwen3-30B-A3B-Base model compared to the colocate setup. It is worth noting that this is far from the upper limit of
+performance gains achievable through asynchrony. Firstly, the comparative experiments used a maximum response length of
+only 8k, which is much shorter than the 20k sequence length in previous experiments, resulting in a less pronounced
+rollout tail effect. Secondly, we adopted a highly skewed resource allocation, with rollout using 96 GPUs and trainer
+using 32 GPUs, which is not an optimal configuration. During the experiments, we observed that the current verl
+implementation imposes certain constraints, such as requiring data to be evenly divisible by the number of GPUs, making
+resource adjustment less flexible. Additionally, as asynchronous training and deployment accelerate, the performance gap
+is gradually narrowing. Therefore, enabling more flexible resource allocation and dynamic resource adjustment in the
+future will be our next focus.
+
+- Machine: H20
+- Model: Qwen3-30B-A3B-Base
+- Rollout length: max_response_length : 8K tokens;
+- Algorithm: GRPO
+- Dataset: TRAIN_FILE: dapo-math-17k.parquet TEST_FILE: aime-2024.parquet
+- Engine: vLLM + Megatron
+- rollout.n: 16
+- ppo_mini_batch_size: 128
+- test_freq: 20
+
+- colocate sync:
+
+ - step:400
+ - train_batch_size: 512
+
+- fully_async_policy
+ - total_rollout_steps: 512\*400
+ - trigger_parameter_sync_step: 512/128 = 4
+ - staleness_threshold: 0.5
+ - partial_rollout: True
+
+| Training Mode | Resource Allocation | Step | Gen | Old Log Prob | Ref | Update Actor | Total Time 100 Step | Total Time 200 Step | Total Time 300 Step | Total Time 400 Step | Acc/Mean@1 |
+| ------------------ | ------------------- | ------ | ------ | ------------ | ----- | ------------ | ------------------- | ------------------- | ------------------- | ------------------- | --------------------------- |
+| Colocate Sync | 128 | 497.89 | 348.05 | 28.73 | 20.86 | 86.27 | 13h 36m | 1d 3h 48m | 1d 19h 4m | 2d 11h 39m | max: 0.3500<br>last: 0.3208 |
+| Fully Async Policy | 96:32 | 282.75 | 22.06 | \ | 50.05 | 206.63 | 6h 45m (2.01x) | 14h 48m (1.88x) | 1d 0h 9m (1.78x) | 1d 10h 41m (1.72x) | max: 0.3813<br>last: 0.3448 |
+
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-30B?nw=nwuserhouzg
+
+### checkpoint-engine Ablation Experiment
+
+We tested the single-step parameter synchronization time of the checkpoint-engine on three models: Qwen2.5-Math-7B, Qwen3-30B-A3B, and Qwen3-235B-A22B, using the default checkpoint-engine configuration. All experiments were performed on H20 machines, and the Megatron engine was used for training.
+
+| model | trainer rank | rollout rank | checkpoint-engine | total sync time |
+|:-----------------:|:--------:|:-------:|:--------------:|:--------------:|
+| Qwen2.5-Math-7B | 4 | 4 | False | 0.12s |
+| Qwen2.5-Math-7B | 4 | 4 | True | 0.02s |
+| Qwen3-30B-A3B | 16 | 16 | False | 15.76s |
+| Qwen3-30B-A3B | 16 | 16 | True | 4.38s |
+| Qwen3-235B-A22B | 64 | 64 | False | 58.57s |
+| Qwen3-235B-A22B | 64 | 64 | True | 23.70s |
+
+### use_trainer_do_validate Experiment
+
+We tested the effect of `use_trainer_do_validate=True` on the training process with Qwen2.5-Math-7B. The results show
+that enabling it reduces validation time overhead and trainer-node idle time: validation time improved by about 2x,
+and trainer-node idle time was reduced by about 40%.
+
+* Machine: H20
+* Model: Qwen2.5-Math-7B
+* Rollout length: max_response_length FSDP2: 10K tokens;
+* Algorithm: DAPO
+* Dataset: TRAIN_FILE: dapo-math-17k.parquet TEST_FILE: aime-2024.parquet
+* Engine: vllm+FSDP2
+* rollout.n: 16
+* ppo_mini_batch_size: 32
+* test_freq: 10
+
+* fully_async_policy
+ * total_rollout_steps: 512*400
+ * require_batches: 4
+ * trigger_parameter_sync_step: 4
+ * staleness_threshold: 0.5
+ * partial_rollout: True
+
+| training mode | resource allocation | step | gen | old_log_prob | update_actor | validate time | total time<br>50 step | acc/mean@2 |
+|:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|
+| colocate sync | 16 | 484.623 | 52.939 | 0 | 430.263 | 205.080 | 7h9m | 22.6 |
+| fully_async_policy | 8:8 | 489.953 | 52.622 | 0 | 435.874 | 95.699 | 7h2m | 21.0 |
+
+
+## Multi-Turn Tool Calling
+
+Referencing **recipe/retool** and **ToolAgentLoop**, we implemented **AsyncPartialToolAgentLoop**, a multi-turn
+tool-calling loop that supports partial_rollout for **fully_async_policy**.
+
+### Core Design
+
+`AsyncPartialToolAgentLoop` inherits from `ToolAgentLoop` and is adapted for the asynchronous training mode of
+`fully_async_policy`. When `partial_rollout=True`, the Rollouter interrupts ongoing generation tasks before
+synchronizing parameters with the Trainer. `AsyncPartialToolAgentLoop` is capable of:
+
+1. **Interrupting Tasks**: Responding to an interrupt signal to save the current state. Currently, interruptions occur
+ during the `GENERATING` process or after other states have completed.
+2. **Resuming Tasks**: Resuming execution from the saved state after parameter synchronization is complete, rather than
+   starting over (see the toy sketch below).
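+
+Conceptually (a toy, standalone sketch only; the attribute and method names are illustrative, not the actual class interface):
+
+```python
+class PartialRolloutState:  # simplified stand-in for the agent loop's state handling
+    def __init__(self):
+        self.tokens = []           # tokens generated so far
+        self.state = "GENERATING"  # current loop state
+        self.saved_state = None
+
+    def on_interrupt(self):
+        # Save progress so the rollout survives parameter synchronization.
+        self.saved_state = {"tokens": list(self.tokens), "state": self.state}
+
+    def on_resume(self):
+        # Continue from the saved prefix instead of restarting the rollout.
+        self.tokens = self.saved_state["tokens"]
+        self.state = self.saved_state["state"]
+```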
+
+### How to Use
+
+RL training with multi-turn tool calling in `fully_async_policy` is similar to `recipe/retool`. It is enabled by
+specifying `multi_turn` configurations in the config file.
+
+1. **SFT Stage**: First, the model should undergo SFT to learn how to follow tool-calling format instructions.
+2. **Multi-turn Configuration**: In the `fully_async_policy` training configuration, set the following parameters:
+ ```yaml
+ actor_rollout_ref:
+ rollout:
+ multi_turn:
+ enable: True # AsyncPartialToolAgentLoop will be used by default in fully_async_policy mode
+ # Other multi_turn related configurations
+ ```
+3. **Async Parameters**: To improve efficiency, enable `partial_rollout` and `staleness_threshold` when using multi-turn
+ tool calling:
+ ```yaml
+ async_training:
+ partial_rollout: True
+ staleness_threshold: 0.5
+ # Other async parameters
+ ```
+4. **Example**: See `recipe/fully_async_policy/shell/dapo_7b_async_retool.sh`.
+
+### Experimental Results
+
+To validate the performance of `fully_async_policy` on multi-turn tool-calling tasks, we compared it with the standard
+`colocate` synchronous mode. Key parameter settings are as follows.
+
+- **SFT Model**: Based on `Qwen2.5-7B-Instruct`, trained for 6 epochs on the `ReTool-SFT` dataset
+- **RL Algorithm**: DAPO
+- **Dataset**:
+ - Train: `DAPO-Math-17k`
+ - Test: `aime_2025`
+- **Resource and Mode Comparison**:
+ - `colocate sync`: 32 H20 gpus
+ - `fully_async_policy`: 16 gpus for Trainer + 16 gpus for Rollouter
+- **Key Configurations**:
+ 1. **Tool Calling Configuration**:
+ - `multi_turn.enable: True`
+ - `multi_turn.max_user_turns: 16`
+ - `multi_turn.max_assistant_turns: 16`
+ - `multi_turn.tool_config_path: recipe/retool/sandbox_fusion_tool_config.yaml`
+ 2. **`colocate sync` Configuration**:
+ - `ppo_mini_batch_size: 16`
+ - `train_batch_size: 64`
+ 3. **`fully_async_policy` Configuration**:
+ - `ppo_mini_batch_size: 16`
+ - `trigger_parameter_sync_step: 4`
+ - `require_batches: 1`
+ - `staleness_threshold: 1`
+ - `partial_rollout: True`
+
+| training mode | Resource allocation | step | gen | old_log_prob | update_actor | total time<br>100 step | total time<br>200 step | aime_2025<br>acc/mean@30 |
+| :----------------: | :-----------------: | :----: | :----: | :----------: | :----------: | :--------------------: | :--------------------: | :-------------------------: |
+| colocate | 32 | 375.47 | 228.03 | 35.19 | 111.84 | 9h 46m | 22h 28m | start: 0.1078<br>last: 0.2056 |
+| fully_async_policy | 16:16 | 221.36 | 40.59 | \ | 179.58 | 6h 19m<br>(1.55x) | 14h 4m<br>(1.60x) | start: 0.11<br>last: 0.2044 |
+
+> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-multiturn-tool?nw=nwuserhouzg
+
+## Future Plans
+- Transfer queue integration
+- Asynchronous parameter synchronization
diff --git a/code/RL_model/verl/verl_train/docs/advance/grafana_prometheus.md b/code/RL_model/verl/verl_train/docs/advance/grafana_prometheus.md
new file mode 100644
index 0000000000000000000000000000000000000000..3b59f936728e2142df8765b6f886804069566cd9
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/advance/grafana_prometheus.md
@@ -0,0 +1,193 @@
+# Use Prometheus and Grafana to Monitor Rollout
+
+**Author:** `https://github.com/meituan-search`
+
+Last updated: 12/05/2025.
+
+We provide an additional training-monitoring capability for verl, leveraging Prometheus and Grafana to display rollout
+information during training, enhancing system observability and facilitating further performance optimization.
+
+The system automatically configures Prometheus to scrape metrics from rollout servers, eliminating manual configuration steps.
+
+## Overview
+
+The figures below show the performance of Qwen3-235B on the AIME2024 dataset with a response length of 20k, where the emergence of a long-tail problem is clearly observable.
+
+
+
+The following figure presents the fully asynchronous training of the Qwen3-235B model. Here, resource idleness is distinctly noticeable, indicating that rollout resources can be reduced.
+
+
+
+Through the above two examples, we also illustrate the necessity of system observability.
+
+## Architecture Overview
+
+The overall workflow consists of the following steps:
+
+1. **Multi-node Ray Cluster Setup**: Start Ray cluster across multiple nodes with Grafana and Prometheus information configured in environment variables on the master node
+2. **Start Grafana Service**: Launch Grafana on the master node for visualization of monitoring dashboards
+3. **Start Prometheus Service**: Launch Prometheus on the master node for metrics collection and storage
+4. **verl Async Rollout Mode**: verl uses async rollout mode to obtain rollout server ports and IP addresses
+5. **Automatic Prometheus Configuration**: verl automatically rewrites the Prometheus configuration to add monitoring for rollout servers and notifies Prometheus to reload the configuration (a sketch of the rewritten config follows this list)
+6. **Metrics Collection**: After program execution, metrics can be viewed in Prometheus
+7. **Dashboard Visualization**: Upload and view monitoring metrics in Grafana dashboards
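+
+For step 5, the rewritten Prometheus config conceptually gains one scrape job per rollout server, roughly like the sketch below (the job name and target addresses are illustrative, not the exact output; the file being rewritten is Ray's `/tmp/ray/session_latest/metrics/prometheus/prometheus.yml`):
+
+```yaml
+scrape_configs:
+  # Ray's own metrics, discovered via its service-discovery file
+  - job_name: ray
+    file_sd_configs:
+      - files: ['/tmp/ray/prom_metrics_service_discovery.json']
+  # Added by verl: one target per rollout server (illustrative addresses)
+  - job_name: rollout
+    static_configs:
+      - targets: ['10.0.0.1:8000', '10.0.0.2:8000']
+```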
+
+## Detailed Setup Steps
+
+### Step 1: Environment Variables and Start Ray Cluster
+
+First, set the necessary environment variables and start the Ray service.
+
+> Reference: [configure-manage-dashboard](https://docs.ray.io/en/latest/cluster/configure-manage-dashboard.html)
+
+```bash
+# Master node environment variables
+export GF_SERVER_HTTP_PORT=3000 # Grafana service default port (customizable)
+export PROMETHEUS_PORT=9090 # Prometheus service default port (customizable)
+export RAY_HEAD_PORT=6379 # Ray master node port (customizable)
+export RAY_DASHBOARD_PORT=8265 # Ray dashboard default port (customizable)
+export GRAFANA_PATHS_DATA=/tmp/grafana # Grafana data storage directory (customizable)
+export RAY_GRAFANA_HOST="http://${master_ip}:${GF_SERVER_HTTP_PORT}" # Ray-associated Grafana address
+export RAY_PROMETHEUS_HOST="http://${master_ip}:${PROMETHEUS_PORT}" # Ray-associated Prometheus address
+
+# Start Ray on master node
+ray start --head --port=${RAY_HEAD_PORT} --dashboard-port=${RAY_DASHBOARD_PORT}
+
+# Start Ray on worker nodes
+ray start --address={master_addr}:${RAY_HEAD_PORT}
+```
+
+**Verification:** Visit `http://master_ip:8265` to confirm Ray has started successfully.
+
+### Step 2: Start Grafana (Visualization Dashboard)
+
+Grafana is used to display metrics collected by Prometheus (such as cache hit rate, throughput, etc.):
+
+```bash
+# Master node
+nohup grafana-server \
+ --config /tmp/ray/session_latest/metrics/grafana/grafana.ini \
+ --homepath /usr/share/grafana \
+ web > grafana.log 2>&1 &
+```
+
+**Verification:** Visit `http://master_ip:3000` to confirm Grafana has started successfully (default credentials: `admin/admin`).
+
+If you need to change the port, modify the `GF_SERVER_HTTP_PORT` environment variable, and grafana-server will automatically recognize it.
+
+### Step 3: Start Prometheus (Metrics Collection)
+
+Prometheus is responsible for scraping metrics from vLLM services and storing them as time-series data:
+
+```bash
+# Master node
+nohup prometheus \
+ --config.file /tmp/ray/session_latest/metrics/prometheus/prometheus.yml \
+ --web.enable-lifecycle \
+ --web.listen-address=:${PROMETHEUS_PORT} \
+ > prometheus.log 2>&1 &
+```
+
+**Verification:** Visit `http://master_ip:9090` to confirm Prometheus service has started successfully.
+
+### Step 4 & 5: Start verl Training
+
+Start verl training with the following parameters configured:
+
+**Required Configuration:**
+
+- `actor_rollout_ref.rollout.mode="async"`
+- `actor_rollout_ref.rollout.disable_log_stats=False`
+- `actor_rollout_ref.rollout.prometheus.enable=True`
+
+If the default port is used, this parameter can be omitted.
+
+- `actor_rollout_ref.rollout.prometheus.port=9090`
+
+If the default path is used, this parameter can be omitted.
+
+- `actor_rollout_ref.rollout.prometheus.file="/tmp/ray/session_latest/metrics/prometheus/prometheus.yml"`
+
+By default, `served_model_name` uses `model_path.split("/")[-1]` for data statistics.
+Users can also set a custom alias:
+
+- `actor_rollout_ref.rollout.prometheus.served_model_name="Qwen3-235B"`
+
+**Shell Script Example:**
+
+```bash
+WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+
+rollout_mode="async"
+rollout_name="vllm" # Options: sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+ export VLLM_USE_V1=1
+ return_raw_chat="True"
+fi
+
+# Synchronous training
+ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
+ --working-dir "${WORKING_DIR}" \
+ -- python3 -m verl.trainer.main_ppo \
+ data.return_raw_chat=${return_raw_chat} \
+ actor_rollout_ref.rollout.name=${rollout_name} \
+ actor_rollout_ref.rollout.mode=${rollout_mode} \
+ actor_rollout_ref.rollout.disable_log_stats=False \
+ actor_rollout_ref.rollout.prometheus.enable=True
+ ...
+
+# Asynchronous training
+ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
+ --working-dir "${WORKING_DIR}" \
+ -- python3 -m verl.experimental.fully_async_policy.fully_async_main \
+ data.return_raw_chat=${return_raw_chat} \
+ actor_rollout_ref.rollout.name=${rollout_name} \
+ actor_rollout_ref.rollout.mode=${rollout_mode} \
+ actor_rollout_ref.rollout.disable_log_stats=False \
+ actor_rollout_ref.rollout.prometheus.enable=True
+ ...
+```
+
+### Step 6: View Metrics in Prometheus
+
+After task execution, verify that Prometheus is correctly collecting metrics.
+
+**Verification:** Visit the Prometheus interface at `http://master_ip:9090` and search for `vllm:` or `sglang:` to
+confirm metrics are being reported correctly.
+
+**Troubleshooting:**
+
+If no metrics appear:
+
+1. Check logs for `AgentLoopManager` to find the server port
+2. Visit `http://master_ip:server_port/metrics` to verify server metrics are available (see the sketch below)
+3. Confirm that `actor_rollout_ref.rollout.disable_log_stats=False` is set
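+
+For reference, a quick command-line sketch of both checks (the metric name shown is a standard vLLM gauge and may vary across engine versions):
+
+```bash
+# Scrape the rollout server's metrics endpoint directly
+# (server_port comes from the AgentLoopManager logs)
+curl -s http://master_ip:server_port/metrics | grep -E '^(vllm|sglang):' | head
+
+# Query Prometheus for a sample vLLM metric via its HTTP API
+curl -s "http://master_ip:9090/api/v1/query?query=vllm:num_requests_running"
+```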
+
+### Step 7: View Metrics in Grafana
+
+After task execution, log in to Grafana to view and customize monitoring dashboards.
+
+**Login:** Visit `http://master_ip:3000` (default credentials: `admin/admin`)
+
+**Import Dashboard:**
+
+1. Select `Dashboards` → `New` → `Import` → `Upload dashboard JSON file`
+2. Upload a pre-built dashboard JSON file
+
+**Available Dashboards:**
+
+- [vLLM Grafana Dashboard style 1](https://github.com/ArronHZG/verl-community/blob/main/docs/grafana/vllm_grafana.json)
+- [vLLM Grafana Dashboard style 2](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/dashboards/grafana/performance_statistics.json)
+- [vLLM Grafana Dashboard style 2](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/dashboards/grafana/query_statistics.json)
+- [SGLang Grafana Dashboard](https://github.com/sgl-project/sglang/blob/main/examples/monitoring/grafana/dashboards/json/sglang-dashboard.json)
+
+## Additional Resources
+
+- [Ray Monitoring Documentation](https://docs.ray.io/en/latest/cluster/configure-manage-dashboard.html)
+- [Prometheus Documentation](https://prometheus.io/docs/)
+- [Grafana Documentation](https://grafana.com/docs/)
+- [vLLM GitHub Repository](https://github.com/vllm-project/vllm)
+- [SGLang GitHub Repository](https://github.com/sgl-project/sglang)
diff --git a/code/RL_model/verl/verl_train/docs/advance/megatron_extension.rst b/code/RL_model/verl/verl_train/docs/advance/megatron_extension.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9a52e6017b7adc77b404398501587aff0e045129
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/advance/megatron_extension.rst
@@ -0,0 +1,20 @@
+Add models with the Megatron-LM backend
+=========================================
+
+Last updated: 04/25/2025.
+
+Model
+-----------
+
+
+If you use the latest verl, ``GPTModel`` is directly supported for the Megatron backend.
+You can follow a similar approach to how Megatron-LM pretrains custom models.
+The steps are listed here (a sketch follows the list):
+
+1. Find `model_initializer.py `_
+2. If your model is configurable by ``TransformerLayerSpec``, you can
+   directly use ``GPTModel``. Otherwise, please implement a new
+   ``ModelLayerSpec`` and ``ModelLayer`` here.
+3. Use the right ``LayerSpec``, ``TransformerConfig`` and ``HuggingfaceConfig``
+   as arguments to initialize the GPTModel.
+4. Finally, return the model.
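+
+A minimal sketch of steps 2-4 for a standard dense architecture (the ``hf_config``
+field names are illustrative and depend on the actual HuggingFace config class):
+
+.. code-block:: python
+
+   from megatron.core.models.gpt import GPTModel
+   from megatron.core.models.gpt.gpt_layer_specs import (
+       get_gpt_layer_with_transformer_engine_spec,
+   )
+
+   def init_model(tf_config, hf_config, pre_process, post_process):
+       # Pick the layer spec that matches the model architecture
+       layer_spec = get_gpt_layer_with_transformer_engine_spec()
+       model = GPTModel(
+           config=tf_config,  # Megatron TransformerConfig
+           transformer_layer_spec=layer_spec,
+           vocab_size=hf_config.vocab_size,
+           max_sequence_length=hf_config.max_position_embeddings,
+           pre_process=pre_process,
+           post_process=post_process,
+       )
+       return model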
diff --git a/code/RL_model/verl/verl_train/docs/advance/mtp.md b/code/RL_model/verl/verl_train/docs/advance/mtp.md
new file mode 100644
index 0000000000000000000000000000000000000000..b4c5a25c631220d5307d11beb1de122f43312699
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/advance/mtp.md
@@ -0,0 +1,105 @@
+# Guide to Using MTP in SFT/RL Training and Inference
+
+**Author**: `https://github.com/meituan-search`
+
+Last updated: 01/30/2026
+
+# 1. Scope of Support
+
+Currently, RL training can be performed on mimo-7B-RL, Qwen-next, and Deepseek series models based on the MTP architecture. The support rules for training and inference engines are as follows:
+
+- **Training Engine**: Only supports the `mbridge + megatron` combination; other training engines are not compatible at this time;
+
+- **Inference Engine**: Compatible with all engines, but the model must be in the corresponding engine's compatibility list;
+
+- **Dependency Versions**:
+
+ - mbridge: Use the specified branch: [https://github.com/ArronHZG/mbridge/tree/feature/verl_mtp](https://github.com/ArronHZG/mbridge/tree/feature/verl_mtp) (will be merged into the main branch in the future);
+
+ - megatron: Use the latest dev version (commit: [23e092f41ec8bc659020e401ddac9576c1cfed7e](https://github.com/NVIDIA/Megatron-LM/tree/23e092f41ec8bc659020e401ddac9576c1cfed7e)), which supports MTP + CP training methods.
+
+  - sglang: Use the specified branch: [https://github.com/ArronHZG/sglang/tree/fix_mtp_update_weights_from_tensor](https://github.com/ArronHZG/sglang/tree/fix_mtp_update_weights_from_tensor) ([PR](https://github.com/sgl-project/sglang/pull/17870)), which fixes the OOM issue when updating MTP weights from tensors.
+
+# 2. MTP Training Configuration (Core Parameters)
+
+The MTP training process can be flexibly controlled through the following configurations. All configurations are based on the `actor_rollout_ref.model.mtp` prefix:
+
+| Configuration Scenario | Core Parameters | Description |
+|------------------------|-----------------|-------------|
+| Load MTP Parameters Only | `enable=True` | VRAM usage will increase, but the exported parameters include the MTP module and can be directly used for online deployment |
+| Full-Parameter MTP Training | `enable=True`<br>`enable_train=True`<br>`mtp_loss_scaling_factor=0.1` | MTP Loss will apply to all model parameters |
+| MTP Parameter-Only Training | `enable=True`<br>`enable_train=True`<br>`detach_encoder=True` | Freeze the Encoder layer and update only MTP module parameters; MTP Loss applies only to MTP parameters |
+| MTP Accelerated Rollout | 1. vLLM configuration:<br>`enable=True`<br>`enable_rollout=True`<br>`method="mtp"`<br>`num_speculative_tokens=1`<br>2. SGLang configuration:<br>`enable=True`<br>`enable_rollout=True`<br>`speculative_algorithm="EAGLE"`<br>`speculative_num_steps=2`<br>`speculative_eagle_topk=2`<br>`speculative_num_draft_tokens=4` | Achieve inference acceleration during the Rollout phase based on MTP |
+
+# 3. Experimental Results
+
+The experiment was conducted as follows:
+
+* model = mimo-7B-math
+* max_response_length = 8k
+
+Experiment chart:
+
+
+
+The wandb link for the graph: [wandb](https://wandb.ai/hou-zg-meituan/mimo-7b-sft-mtp?nw=nwuserhouzg)
+
+**Scenarios with No Significant Effect**
+
+The following configurations will not have a noticeable impact on training results:
+
+1. The base model does not carry MTP parameters;
+
+2. The base model carries MTP parameters, but the MTP module is not trained;
+
+3. The base model carries MTP parameters and trains MTP, with `mtp_loss_scaling_factor=0`;
+
+4. The base model carries MTP parameters, trains MTP and detaches the encoder, with `mtp_loss_scaling_factor=0.1`.
+
+**Scenarios with Significant Effect**
+
+Only the following configuration will have a noticeable impact on training results:
+
+- The base model carries MTP parameters, MTP Loss applies to all model parameters, and `mtp_loss_scaling_factor=0.1`.
+
+**Recommended Training Method**
+
+It is recommended to adopt the `detach_encoder=True` approach for MTP training.
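+
+A minimal sketch of the corresponding launch flags, using the `actor_rollout_ref.model.mtp` prefix from Section 2 (the entry point and all other training arguments are omitted and depend on your setup):
+
+```bash
+python3 -m verl.trainer.main_ppo \
+    ... \
+    actor_rollout_ref.model.mtp.enable=True \
+    actor_rollout_ref.model.mtp.enable_train=True \
+    actor_rollout_ref.model.mtp.detach_encoder=True
+```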
+
+# 4. Performance Notes for MTP in Rollout Inference
+
+The effectiveness of MTP-accelerated Rollout is significantly affected by **model size** and **inference hardware**. Key reference information is as follows:
+
+**Hardware Tensor Core Performance**
+
+| Hardware Model | FP16 Performance (TFLOPS) |
+|----------------|---------------------------|
+| H20 | 148 |
+| H800 | 1,671 |
+| H200 | 1,979 |
+
+**Measured Performance and Recommendations**
+
+Taking the mimo-7B model deployed separately on H20 hardware using SGLang as an example: After enabling MTP speculative decoding, the Rollout throughput decreases by approximately 50%.
+
+- Current priority recommendation: Do not enable MTP acceleration during the inference phase for now;
+
+- Future planning: Further optimization of the speculative logic in the Rollout phase will be conducted to improve throughput performance.
+
+# 5. SFT training
+
+SFT training with MTP is supported, using the same MTP configuration as RL training.
+
+An example configuration for running SFT can be found in `examples/sft/gsm8k/run_mimo_megatron_mtp.sh`.
+
+**SFT result**
+
+The experiment was conducted with the following setup:
+- model = mimo-7B-math
+- dataset = gsm8k
+
+The result: [wandb link](https://wandb.ai/hou-zg-meituan/mimo-7b-sft-mtp?nw=nwuserhouzg)
+
+The presence of the MTP layer has limited effect on the main loss. However, when the MTP layer is detached, the MTP loss converges to a higher value.
+
diff --git a/code/RL_model/verl/verl_train/docs/advance/one_step_off.md b/code/RL_model/verl/verl_train/docs/advance/one_step_off.md
new file mode 100644
index 0000000000000000000000000000000000000000..99170d75edc3112b5eba00ab562d8c2316acb9c0
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/advance/one_step_off.md
@@ -0,0 +1,319 @@
+# Recipe: One Step Off Policy Async Trainer
+
+**Author:** `https://github.com/meituan-search`
+
+Last updated: 07/17/2025.
+
+## Introduction
+
+### Background
+
+The current reinforcement learning training process implemented by verl is synchronous, adhering to the algorithmic
+workflows of established methods like PPO, GRPO, and DAPO. In each step, training samples are generated by the latest
+model, and the model is updated after training completes. While this approach aligns with on-policy reinforcement
+learning and stabilizes RL training, it suffers from severe efficiency issues.
+Model updates must wait for the longest output in the generation phase to complete.
+During the generation of long-tail samples, GPUs remain idle, resulting in significant underutilization.
+The more severe the long-tail problem in sample generation, the lower the overall training efficiency.
+For example, in DAPO 32B training, the Rollout phase accounts for approximately 70% of the total time,
+and increasing resources does not reduce the Rollout duration.
+
+
+
+> source data: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=nwusertongyuxuan361
+
+### Solution
+
+We have implemented the **One Step Off Async Trainer** to help alleviate this issue. This approach parallelizes the
+generation and training processes, utilizing samples generated in the previous step for current training.
+It also involves appropriately partitioning resources, allocating dedicated resources for generation while automatically
+assigning the remainder to training. By reducing resources allocated to the generation phase, we mitigate GPU idle time
+during long-tail sample generation. Throughout this process, generation and training parameters maintain a one-step off
+policy.
+
+
+
+> reference: [AReaL: A Large-Scale Asynchronous Reinforcement Learning System for Language Reasoning](https://arxiv.org/abs/2505.24298)
+
+Our core contributions include:
+
+1. **Parallel Generation and Training**:
+ Samples for the next batch are asynchronously generated while the current batch is being trained.
+
+2. **Resource Isolation**:
+ Unlike `hybrid_engine`, this method requires explicit resource allocation for rollout, with remaining resources
+ automatically assigned to training.
+
+3. **NCCL Parameter Synchronization**:
+ Employs NCCL communication primitives for seamless parameter transfer between generation and training modules.
+
+### Experimental Results
+
+- **Machine Configuration**: 2 nodes, 16 H20 GPUs in total
+ - Generation: 4 GPUs
+ - Training: 12 GPUs
+- **Model**: Qwen2.5-Math-7B
+- **Rollout Configuration**:
+  - **Max Response Length**: FSDP2: 20,480 tokens; Megatron: 8,192 tokens
+- **Algorithm**: DAPO
+- **Rollout Engine**: vLLM
+
+| training mode | engine | step | gen | wait_prev_gen | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | acc/maj@32/mean |
+| ---------------------- | ------------- | ---- | --- | ------------- | ------------------ | ------------ | ------------ | -------------- | ---------------- | --------------- |
+| colocate sync | VLLM+FSDP2 | 749 | 321 | - | 247 | 88 | 286 | 19h18m | 0.5948 | 0.417 |
+| one-step-overlap async | VLLM+FSDP2 | 520 | - | 45 | 458 | 108 | 337 | 15h34m(+23%) | 0.6165 | 0.494 |
+| colocate sync | VLLM+Megatron | 699 | 207 | - | 162 | 119 | 344 | 18h21m | 0.605 | 0.4217 |
+| one-step-overlap async | VLLM+Megatron | 566 | - | 59 | 501 | 120 | 347 | 13h06m (+40%) | 0.6569 | 0.4038 |
+
+- colocate sync: step ≈ gen + old_log_prob + update_actor
+- one-step-overlap async: step ≈ wait_prev_gen + old_log_prob + update_actor
+
+
+
+> source data: https://wandb.ai/hou-zg-meituan/one-step-off-policy?nw=nwuserhouzg
+
+## Implementation
+
+### One Step Off Policy Async Pipeline
+
+Our implemented **One Step Off Policy Async Pipeline** integrates seamlessly into existing training logic at minimal
+cost,
+eliminating the need for additional sample storage management. The core mechanism uses `async_gen_next_batch`
+for asynchronous rollout generation while maintaining continuous operation during epoch transitions
+via `create_continuous_iterator`.
+
+```python
+# iterator generator, simplify one-step integration of the training process
+def _create_continuous_iterator(self):
+ for epoch in range(self.config.trainer.total_epochs):
+ iterator = iter(self.train_dataloader)
+ for batch_dict in iterator:
+ yield epoch, batch_dict
+
+
+# read the next batch of samples, sync parameters, and launch async gen_seq
+def _async_gen_next_batch(self, continuous_iterator):
+ # read train_data
+ try:
+ epoch, batch_dict = next(continuous_iterator)
+ except StopIteration:
+ return None
+ batch = DataProto.from_single_dict(batch_dict)
+    gen_batch = batch_process(batch)
+ # sync weights from actor to rollout
+ self.sync_rollout_weights()
+ # async generation
+ gen_batch_output = self.rollout_wg.async_generate_sequences(gen_batch)
+ # future encapsulated
+ return GenerationBatchFuture(epoch, batch, gen_batch_output)
+
+
+continuous_iterator = self._create_continuous_iterator()
+# run rollout first to achieve one-step-off
+batch_data_future = self._async_gen_next_batch(continuous_iterator)
+
+while batch_data_future is not None:
+ # wait for the gen_seq result from the previous step
+ batch = batch_data_future.get()
+ # launch the next async call to generate sequences
+ batch_data_future = self._async_gen_next_batch(continuous_iterator)
+
+ # compute advantages
+ batch = critic.compute_values(batch)
+ batch = reference.compute_log_prob(batch)
+ batch = reward.compute_reward(batch)
+ batch = compute_advantages(batch)
+
+ # model update
+ critic_metrics = critic.update_critic(batch)
+ actor_metrics = actor.update_actor(batch)
+```
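+
+For reference, a minimal sketch of the `GenerationBatchFuture` wrapper used above (illustrative; the actual class in the one-step-off-policy recipe may differ in detail):
+
+```python
+class GenerationBatchFuture:
+    """Wraps an in-flight async rollout together with its source batch."""
+
+    def __init__(self, epoch, batch, gen_batch_output):
+        self.epoch = epoch
+        self.batch = batch                        # original training batch
+        self.gen_batch_output = gen_batch_output  # async rollout handle
+
+    def get(self):
+        # Block until the rollout finishes, then merge the generated
+        # sequences back into the original batch.
+        result = self.gen_batch_output.get()
+        return self.batch.union(result)
+```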
+
+### Parameter Synchronization
+
+A highlight is that our NCCL-based weight update for the rollout model performs very well:
+most of the time the latency is under 300 ms, which is negligible for RLHF.
+
+> **sync_rollout_weights**: The time for synchronizing parameters from actor to rollout is extremely fast and can almost
+> be ignored because it is implemented with NCCL.
+
+```python
+class ActorRolloutRefWorker:
+ # actor acquires the meta-info of model parameters for parameter sync
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def get_actor_weights_info(self):
+ params = self._get_actor_params()
+ ret = []
+ for key, tensor in params.items():
+ ret.append((key, tensor.size(), tensor.dtype))
+ self._weights_info = ret
+ return ret
+
+ # rollout sets the meta-info of model parameters for parameter sync
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
+ def set_actor_weights_info(self, weights_info):
+ self._weights_info = weights_info
+
+
+class AsyncRayPPOTrainer(RayPPOTrainer):
+ def init_workers(self):
+ ...
+ # rollout obtains the meta-info of model parameters from the actor for parameter sync
+ weights_info = self.actor_wg.get_actor_weights_info()[0]
+ self.rollout_wg.set_actor_weights_info(weights_info)
+
+ # Create an actor-rollout communication group for parameter sync
+        self.create_weight_sync_group()
+```
+
+```python
+# The driving process invokes the actor and rollout respectively to create a weight synchronization group based on nccl/hccl.
+def create_weight_sync_group(self):
+ master_address = ray.get(self.actor_wg.workers[0]._get_node_ip.remote())
+ master_port = ray.get(self.actor_wg.workers[0]._get_free_port.remote())
+ world_size = len(self.actor_wg.workers + self.rollout_wg.workers)
+ self.actor_wg.create_weight_sync_group(
+ master_address,
+ master_port,
+ 0,
+ world_size,
+ )
+ ray.get(
+ self.rollout_wg.create_weight_sync_group(
+ master_address,
+ master_port,
+ len(self.actor_wg.workers),
+ world_size,
+ )
+ )
+
+# The driver process calls the actor and rollout respectively to sync parameters via NCCL
+def sync_rollout_weights(self):
+ self.actor_wg.sync_rollout_weights()
+ ray.get(self.rollout_wg.sync_rollout_weights())
+
+
+# fsdp model parameter sync
+@register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False)
+def sync_rollout_weights(self):
+ params = self._get_actor_params() if self._is_actor else None
+ if self._is_rollout:
+ inference_model = (
+ self.rollout.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model
+ )
+ from verl.utils.vllm.patch import patch_vllm_moe_model_weight_loader
+ patch_vllm_moe_model_weight_loader(inference_model)
+ # Model parameters are broadcast tensor-by-tensor from actor to rollout
+ for key, shape, dtype in self._weights_info:
+ tensor = torch.empty(shape, dtype=dtype, device=get_torch_device().current_device())
+ if self._is_actor:
+ assert key in params
+ origin_data = params[key]
+ if hasattr(origin_data, "full_tensor"):
+ origin_data = origin_data.full_tensor()
+ if torch.distributed.get_rank() == 0:
+ tensor.copy_(origin_data)
+ from ray.util.collective import collective
+
+ collective.broadcast(tensor, src_rank=0, group_name="actor_rollout")
+ if self._is_rollout:
+ inference_model.load_weights([(key, tensor)])
+```
+
+### PPO Correctness
+
+To ensure the correctness of the PPO algorithm, we use rollout log_probs for PPO importance sampling.
+For the related algorithm details, please refer to: https://verl.readthedocs.io/en/latest/algo/rollout_corr_math.html
+The default mode is `bypass_ppo_clip`, but other modification strategies can also be explored.
+
+### AgentLoop
+
+In the current implementation, we no longer provide SPMD model rollout mode.
+Instead, we have switched to AgentLoop mode, which also supports multi-turn tool calling.
+
+## Usage
+
+### FSDP2 Configuration Example
+
+```shell
+# actor and rollout are placed separately (hybrid_engine=False),
+# each with dedicated GPU resources
+python3 -m verl.experimental.one_step_off_policy.async_main_ppo \
+    --config-path=config \
+    --config-name='one_step_off_ppo_trainer.yaml' \
+    actor_rollout_ref.actor.strategy=fsdp2 \
+    actor_rollout_ref.hybrid_engine=False \
+    trainer.nnodes=1 \
+    trainer.n_gpus_per_node=6 \
+    rollout.nnodes=1 \
+    rollout.n_gpus_per_node=2
+```
+
+### Megatron Configuration Example
+
+```shell
+# actor and rollout are placed separately (hybrid_engine=False),
+# each with dedicated GPU resources
+python3 -m verl.experimental.one_step_off_policy.async_main_ppo \
+    --config-path=config \
+    --config-name='one_step_off_ppo_megatron_trainer.yaml' \
+    actor_rollout_ref.actor.strategy=megatron \
+    actor_rollout_ref.hybrid_engine=False \
+    trainer.nnodes=1 \
+    trainer.n_gpus_per_node=6 \
+    rollout.nnodes=1 \
+    rollout.n_gpus_per_node=2
+```
+
+### Configuration Guidelines
+
+1. **GPU Count Relationships**
+ Maintain either of these relationships for optimal batch distribution:
+
+ - `actor_rollout_ref.rollout.n` should be an integer divisor of:
+ `trainer.n_gpus_per_node * trainer.nnodes`
+ - `actor_rollout_ref.rollout.n * data.train_batch_size` should be evenly divisible by:
+ `trainer.n_gpus_per_node * trainer.nnodes`
+
+ > Rationale: Ensures training samples can be evenly distributed across training GPUs when using partial resources for
+ > generation.
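+
+   > Worked example (hypothetical numbers): with `trainer.nnodes=1`, `trainer.n_gpus_per_node=12`,
+   > `actor_rollout_ref.rollout.n=4`, and `data.train_batch_size=96`, both conditions hold:
+   > 4 divides 12, and 4 * 96 = 384 is evenly divisible by 12.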
+
+2. **Dynamic Resource Tuning**
+   Adjust `trainer.nnodes`, `trainer.n_gpus_per_node`, `rollout.nnodes`, and `rollout.n_gpus_per_node` based on phase
+ durations:
+ - **Ideal state**: Rollout and training phases have comparable durations
+ - **Diagnostic metrics**:
+ - Monitor `wait_prev_gen` duration
+ - Analyze `sequence_length` distribution
+ - **Adjustment strategy**:
+ - High `wait_prev_gen` + uniform sequence lengths → Increase rollout resources
+ - High `wait_prev_gen` + long-tail sequences → Optimize stopping criteria (resource increase won't help)
+   > **wait_prev_gen**: The time consumed waiting for the previous rollout to end (the part that is not fully
+ > overlapped).
+ > **Resource Configuration Strategies:**
+ - **Resource-constrained scenario**: Optimize resource utilization by adjusting GPU allocation ratios,
+ keeping the number of nodes equal to allow training and rollout to share nodes;
+ - Configure `trainer.nnodes = rollout.nnodes` with
+ `trainer.n_gpus_per_node + rollout.n_gpus_per_node = physical_gpus_per_node`. Control rollout resource
+ allocation by adjusting `n_gpus_per_node`.
+ - **Resource-abundant scenario**: Optimize performance by adjusting the number of nodes,
+ keeping the number of GPUs per node equal to enable independent scaling of training and rollout
+ parallelism.
+ - Configure `trainer.n_gpus_per_node = rollout.n_gpus_per_node` and control rollout resource allocation by
+       adjusting `trainer.nnodes` and `rollout.nnodes` to achieve optimal performance.
+ > **Note**: The total number of nodes required by the system is not simply `trainer.nnodes + rollout.nnodes`. The
+ > actual calculation depends on GPU capacity:
+ >
+ > - When `trainer.n_gpus_per_node + rollout.n_gpus_per_node <= physical_gpus_per_node`,
+ > the required node count is `max(trainer.nnodes, rollout.nnodes)`
+ > - When `trainer.n_gpus_per_node + rollout.n_gpus_per_node > physical_gpus_per_node`,
+ > the required node count is `trainer.nnodes + rollout.nnodes`
+
+## Functional Support
+
+| Category | Support Situation |
+| ------------------ | --------------------------------------------------------------------------------------------------------------- |
+| train engine       | FSDP2<br>Megatron |
+| rollout engine     | vLLM |
+| AdvantageEstimator | GRPO<br>GRPO_PASSK<br>REINFORCE_PLUS_PLUS<br>RLOO<br>OPO<br>REINFORCE_PLUS_PLUS_BASELINE<br>GPG |
+| Reward | all |
diff --git a/code/RL_model/verl/verl_train/docs/advance/placement.rst b/code/RL_model/verl/verl_train/docs/advance/placement.rst
new file mode 100644
index 0000000000000000000000000000000000000000..43ba761f76d86591d31b447c0ac5140149dd1082
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/advance/placement.rst
@@ -0,0 +1,13 @@
+Ray API Design Tutorial
+=======================================
+
+Last updated: 10/30/2024.
+
+We provide a tutorial for our Ray API design, including:
+
+- Ray basic concepts
+- Resource Pool and RayWorkerGroup
+- Data Dispatch, Execution and Collection
+- Initialize the RayWorkerGroup and execute the distributed computation in the given Resource Pool
+
+See details in `tutorial.ipynb `_.
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/docs/advance/ppo_lora.rst b/code/RL_model/verl/verl_train/docs/advance/ppo_lora.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5317f9fb15b1664b5e57d1a0daafee5b93365193
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/advance/ppo_lora.rst
@@ -0,0 +1,208 @@
+RL(HF) algorithms with LoRA Support
+===========================================
+
+Last updated: 12/17/2025.
+
+We support LoRA (Low-Rank Adaptation) for reinforcement learning algorithms such as PPO, GRPO, and others.
+
+LoRA is a parameter-efficient fine-tuning technique that injects trainable low-rank matrices into pre-trained weights (typically linear layers). This reduces memory footprint and compute cost, making it possible to fine-tune large models with limited hardware.
+
+The benefits this brings include:
+
+- reinforcement learning with very large models (e.g. 70B+) on modest hardware (e.g. 8x80G GPUs),
+- larger batch sizes thanks to the reduced memory usage,
+- simpler model transfer and deployment, since only the LoRA adapters need to be saved,
+- combination with techniques like `SLoRA `_ or `CCoE `_ to serve multiple LoRA adapters efficiently.
+
+This guide explains how to enable LoRA in RL training and configure related parameters.
+
+FSDP Backend Usage Guide
+------------------------
+
+.. note::
+
+ This section applies to **FSDP/FSDP2 backend only**. For Megatron backend, see the :ref:`megatron-lora` section below.
+
+1. LoRA is available in the `verl.trainer.ppo.ray_trainer.RayPPOTrainer`. Examples are provided via the `verl.trainer.main_ppo` entry point.
+
+2. Currently, LoRA is supported via HuggingFace PEFT, only with the FSDP/FSDP2 and vLLM backends (SGLang support coming soon).
+
+- `strategy=fsdp` or `strategy=fsdp2`
+- `rollout.name=vllm`
+
+3. Required configurations for LoRA:
+
+- `actor_rollout_ref.model.lora_rank`: int, set to a reasonable value greater than 0 (e.g., 8, 16, 32, 64)
+- `actor_rollout_ref.model.lora_alpha`: float, the alpha term in LoRA
+- `actor_rollout_ref.rollout.load_format="safetensors"`: required. This enables vLLM to load the base model.
+- `actor_rollout_ref.model.target_modules`: the target modules for LoRA. Typically set to "all-linear".
+
+4. Optional configurations for LoRA:
+
+- `actor_rollout_ref.model.lora_adapter_path`: string, path to a pretrained LoRA adapter directory.
+  If provided, the existing adapter is loaded instead of creating a new one, which enables multi-stage training from previously saved adapters.
+  The directory must contain `adapter_model.safetensors` and `adapter_config.json`.
+
+5. Recommended options:
+
+- `actor_rollout_ref.model.use_shm=True`: preload the model into `/dev/shm` to improve model loading speed.
+- `actor_rollout_ref.rollout.layered_summon=True`: this enables the actor-model to gather the FSDP shards per layers when synchronizing the LoRA Adapter to vLLM, thereby reducing GPU peak memory. Recommended if the model is very large (70B+) or the GPU memory is limited (< 48GB)
+
+.. _megatron-lora:
+
+Megatron Backend Usage Guide
+----------------------------
+
+.. warning::
+
+ The FSDP-specific config options are **NOT applicable** to Megatron backend, and they will be ignored if set. Only options listed under ``lora`` key are applicable:
+
+ - ``actor_rollout_ref.model.lora.*``
+ - ``critic.model.lora.*``
+
+You need to install and enable Megatron-Bridge for Megatron LoRA support.
+
+Make sure you use a Megatron-Bridge version later than 0.2.0 (we recommend `this commit `_ or later for proper support), and use the following settings to enable Megatron-Bridge:
+
+- ``actor_rollout_ref.actor.megatron.use_mbridge=True``
+- ``actor_rollout_ref.actor.megatron.vanilla_mbridge=False``
+
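+A minimal command-line sketch combining these switches with the ``lora`` options described below (rank and alpha values are illustrative):
+
+.. code-block:: bash
+
+   actor_rollout_ref.actor.megatron.use_mbridge=True \
+   actor_rollout_ref.actor.megatron.vanilla_mbridge=False \
+   actor_rollout_ref.model.lora.rank=32 \
+   actor_rollout_ref.model.lora.alpha=32 \
+   actor_rollout_ref.model.lora.merge=False
+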
+**Key Differences from FSDP LoRA:**
+
+1. **LoRA Implementation**: Verl Megatron backend uses Megatron-Bridge's native LoRA implementation, which differs from HuggingFace PEFT.
+
+2. **Weight Sync / Refit Mechanism**: Currently, Megatron-Bridge can sync weights either by merging LoRA adapters into the base model weights before transferring to vLLM (better inference speed, but more refit time and potential precision loss) or by loading separate adapters.
+
+**Configuration for Megatron LoRA:**
+
+.. code-block:: yaml
+
+ actor_rollout_ref:
+ model:
+ lora:
+ # LoRA type: "lora", "vlm_lora", "canonical_lora", or "dora"
+ type: lora
+
+ # whether to sync weights / refit by either merging LoRA adapters into the base model weights before transferring to vLLM (for better inference speed but more refit time and potential precision loss). If this is False, it will load separate adapters.
+ merge: False
+
+ # LoRA rank (Dimension of the low-rank projection space.). Set to 0 to disable LoRA
+ rank: 0
+
+ # Weighting factor for the low-rank projection. Defaults to 32
+ alpha: 32
+
+ # Dropout rate for the low-rank projection. Defaults to 0.0
+ dropout: 0.0
+
+ # A list of module names to apply LoRA to.
+ # For fused LoRA, Defaults to all linear layers ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2'].
+ # For canonical LoRA: ["linear_q", "linear_k", "linear_v", "linear_proj", "linear_fc1_up", "linear_fc1_gate", "linear_fc2"]
+ # - 'linear_qkv': Apply LoRA to the fused linear layer used for query, key, and value projections in self-attention
+ # - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention
+ # - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP
+ # - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP
+ # Target modules can also contain wildcards. For example, you can specify
+ # target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv on the first two layers
+ #
+ # Note:
+ # For MLA (e.g., DeepSeek), you should use ["linear_kv_down_proj","linear_kv_up_proj","linear_q_down_proj","linear_q_up_proj","linear_q_proj"]
+ # Instead of "linear_qkv" or ["linear_q","linear_k","linear_v"]
+ # By default, MoE routers are excluded from LoRA adaptation, and you will need to specify "router" in target_modules to include them.
+ target_modules:
+ - linear_qkv
+ - linear_proj
+ - linear_fc1
+ - linear_fc2
+
+      # A list of module names not to apply LoRA to. It will match all nn.Linear & nn.Linear-adjacent modules whose name
+      # does not match any string in exclude_modules. If used, target_modules must be an empty list or None
+ exclude_modules: []
+
+ # Position for applying dropout, can be 'pre' (before the low-rank projection) or 'post' (after). Defaults to 'pre'
+ dropout_position: pre
+
+ # Initialization method for the low-rank matrix A. Defaults to "xavier".
+ lora_A_init_method: xavier
+
+ # Initialization method for the low-rank matrix B. Defaults to "zero".
+ lora_B_init_method: zero
+
+ # Enables the experimental All-to-All (A2A) communication strategy. Defaults to False
+ a2a_experimental: False
+
+ # Parameter data type for LoRA weights. Default to null, which will use model's dtype.
+ dtype: null
+
+ # Path to pre-trained LoRA adapter weights (null to train from scratch)
+ adapter_path: null
+
+ # VLMLoRA additionally allows the user to specify whether the language or vision models should be frozen.
+ # For example, a common finetuning workload for multimodal models is to apply adapters to language model and fully
+ # finetune the vision model.
+ freeze_vision_model: True
+ freeze_vision_projection: True
+ freeze_language_model: True
+
+LoRA training experiment with Qwen3-8B on 8 * H200 single node comparing FSDP and Megatron backend (script adapted from examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh):
+
+.. image:: https://github.com/user-attachments/assets/0482f423-01a3-4e52-a7ee-8b9cd79b7b1a
+.. image:: https://github.com/user-attachments/assets/6ce10400-8164-47d8-90a6-c1bf002fb9e8
+.. image:: https://github.com/user-attachments/assets/092d3a43-4eba-425e-a584-8d83c1f02de4
+
+
+Best Practices and Notes
+-------------------------
+
+1. **Learning rate**: it is recommended to increase the value of learning rate by an order of magnitude.
+
+2. **LoRA Rank**:
+
+- Too small a rank can hurt convergence.
+- LoRA rank recommendation from @thelongestusernameofall:
+
+  - A very small lora_rank can lead to slower convergence or worse training performance. It is recommended to set lora_rank to be >= 32. Tests have shown that for a 0.5B model, with lora_rank=32, the training convergence speed and final performance are almost identical to non-LoRA training.
+  - For a 32B model, with lora_rank=128, the training convergence speed and final performance are also almost identical to non-LoRA training.
+ - More comprehensive reference results are coming soon.
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/f2b80b8b26829124dd393b7a795a0640eff11644/docs/lora.jpg?raw=true
+
+3. **FSDP-Specific:** Reference configuration for RL training with the Qwen2.5-72B model using 8 x 80GB GPUs (increase lora_rank if needed):
+
+.. code-block::
+
+ data.train_batch_size=64 \
+ actor_rollout_ref.model.use_shm=True \
+ actor_rollout_ref.model.lora_rank=32 \
+ actor_rollout_ref.model.lora_alpha=32 \
+ actor_rollout_ref.model.target_modules=all-linear \
+ actor_rollout_ref.actor.optim.lr=3e-5 \
+ actor_rollout_ref.actor.fsdp_config.fsdp_size=8 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=True \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
+ actor_rollout_ref.rollout.n=5 \
+ actor_rollout_ref.rollout.max_num_seqs=64 \
+ actor_rollout_ref.rollout.max_model_len=1536 \
+ actor_rollout_ref.rollout.max_num_batched_tokens=1536 \
+ actor_rollout_ref.rollout.load_format=safetensors \
+ actor_rollout_ref.rollout.layered_summon=True \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
+
+Example Scripts
+-------------------
+
+For end-to-end examples, refer to the scripts below:
+
+**FSDP Examples:**
+
+- LoRA training from scratch: examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora.sh
+- LoRA training from adapter path: examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora_from_adapter.sh
+
+**Megatron Examples:**
+
+- LoRA training with Dense: examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh
+- LoRA training with MoE: examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh
diff --git a/code/RL_model/verl/verl_train/docs/advance/reward_loop.rst b/code/RL_model/verl/verl_train/docs/advance/reward_loop.rst
new file mode 100644
index 0000000000000000000000000000000000000000..cb755d9c6044e14f59f1e88d476fa4dd526d3260
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/advance/reward_loop.rst
@@ -0,0 +1,301 @@
+Reward Loop
+===========
+
+Author: `Yuyang Ding <https://yyding1.github.io>`_
+
+Last updated: 12/20/2025.
+
+.. warning::
+ Reward Loop is ready for use, but the API may change in future releases.
+    Users can set ``reward_model.use_reward_loop=True`` or ``False`` to control whether the reward loop is enabled.
+
+Reward Loop is designed to support flexible and user-friendly reward computation, with most of the implementation located in ``verl/experimental/reward_loop``.
+
+Compared with the previous reward mechanism, the Reward Loop offers the following key features:
+
+1. provides a more flexible and user-friendly design for reward-model settings, enabling hybrid reward scenarios where multiple reward sources can be seamlessly integrated.
+2. implements asynchronous reward computation instead of the previous batch-based computation, improving efficiency for both rule-based rewards and reward-model-based scenarios.
+
+Hybrid Reward Scenarios
+-----------------------
+
+Reward Loop covers all typical reward-computation scenarios.
+
+- **Rule-based Reward**: The reward is determined by predefined rules, e.g., checking whether the predicted answer matches the ground truth via simple string matching.
+- **Discriminative Reward Model (DisRM)**: The reward is produced by a specified discriminative reward model, such as ``Skywork/Skywork-Reward-Llama-3.1-8B-v0.2``.
+- **Generative Reward Model (GenRM)**: The reward is obtained using a generative reward model, for example ``dyyyyyyyy/FAPO-GenRM-4B``.
+- **Hybrid Reward Scenarios**: Reward Loop provides interfaces for plugging in reward models, allowing users to define custom reward logic based on their needs (e.g., combining rule-based methods with GenRM).
+
+Rule-based Reward
+~~~~~~~~~~~~~~~~~
+
+If ``custom_reward_function`` is not provided, the reward loop will fall back to the default rule-based reward function.
+Otherwise, only the user-defined reward function will be used. The files under ``verl/utils/reward_score/`` provide some examples.
+
+Reward Loop supports both synchronous and asynchronous user-defined reward functions. It automatically detects the function type and executes it accordingly, ensuring that reward computation remains non-blocking and efficient.
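+
+For instance, a minimal synchronous rule-based function might look as follows (a sketch: the positional signature mirrors the GenRM example below, and the ``####`` answer delimiter is a GSM8K convention):
+
+.. code:: python
+
+    def compute_score(data_source, solution_str, ground_truth, extra_info):
+        """Simple string-match rule: full score iff the extracted answer matches."""
+        answer = solution_str.split("####")[-1].strip()
+        return {"score": 1.0 if answer == ground_truth else 0.0}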
+
+Discriminative Reward Model (DisRM)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+For scenarios involving a discriminative reward model, users should provide ``reward_model.model.path`` to specify the reward model.
+
+The Reward Loop will pass the question and the model rollout as inputs to the reward model and obtain a reward score from its output.
+
+Generative Reward Model (GenRM)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+For generative reward model scenarios, users need to specify both ``reward_model.model.path`` and ``custom_reward_function``.
+
+The custom reward function should implement the following components:
+
+- Convert the question and the model rollout into a GenRM input prompt using a custom prompt template.
+- Invoke the GenRM to perform generation with custom sampling parameters. For this purpose, the Reward Loop provides an HTTP interface (i.e., ``reward_router_address``) for interacting with GenRM.
+- Parse the GenRM output using a custom parser and extract the reward score.
+
+As these steps are highly customizable and task-dependent, we offer this flexibility entirely to the user-defined reward function.
+
+Below we provide an example of a custom reward function using GenRM.
+
+.. code:: python
+
+ async def compute_score_gsm8k(
+ data_source: str,
+ solution_str: str,
+ ground_truth: str,
+ extra_info: dict,
+ reward_router_address: str, # an HTTP router endpoint provided by Reward Loop
+ reward_model_tokenizer: PreTrainedTokenizer,
+ ):
+ """Compute the reward score."""
+
+ # Step 1: Prepare prompt and request payload
+ grm_prompt = GRM_PROMPT_TEMPLATE.format(problem=extra_info["question"], solution=solution_str)
+ messages = [{"role": "user", "content": grm_prompt}]
+ sampling_params = {"temperature": 0.7, "top_p": 0.8, "max_tokens": 4096}
+ chat_complete_request = {"messages": messages, **sampling_params}
+
+ # Step 2: Send async request to the reward model
+ # here, chat_complete sends async http request to the router address
+ result = await chat_complete(
+ router_address=reward_router_address,
+ chat_complete_request=chat_complete_request,
+ )
+
+ # Step 3: Parse model response and extract score
+ grm_response = result.choices[0].message.content.strip()
+ try:
+ score_str = grm_response.split("\n\n")[-1].strip()
+ score = int(score_str)
+ except Exception:
+ score = 0
+
+ return {"score": score}
+
+Hybrid Reward Scenarios
+~~~~~~~~~~~~~~~~~~~~~~~
+
+For more complex application settings, such as combining rule-based rewards with GenRM, or mixing rule-based rewards with DisRM, users can also achieve this by specifying the ``reward_model.model.path`` together with the ``custom_reward_function``.
+The implementation of the customized reward function follows the same pattern as illustrated above.
+
+A runnable and reproducible example that demonstrates how to use a rule-based reward function together with a GenRM is provided in the ``recipe/fapo`` directory for reference; you are welcome to use and cite it.
+
+Architecture Design
+-------------------
+
+Reward Loop supports multiple execution modes for reward training:
+
+- **Colocate Mode**: The reward model shares the same resource pool as the actor/rollout/reference models. In this setup, all rollouts must complete first, after which the reward model is awakened to perform inference.
+- **Standalone Mode**: The reward model runs on a separate resource pool, independent from the actor/rollout/reference models. In this setup, each sample is evaluated by the reward model immediately after its rollout finishes.
+
+.. image:: https://github.com/yyDing1/verl-materials/blob/main/reward_loop.svg?raw=true
+
+RewardLoopWorker
+~~~~~~~~~~~~~~~~~
+
+The ``RewardLoopWorker`` is responsible for handling batch-level reward computation, operating in an asynchronous manner.
+
+.. image:: https://github.com/yyDing1/verl-materials/blob/main/reward_loop_worker.svg?raw=true
+
+For each sample, the reward is computed according to the following logic:
+
+- if ``custom_reward_function`` is provided, we directly use user-customized reward function
+- if ``custom_reward_function`` is not provided:
+ - **reward model is not enabled**: use default rule-based reward function
+ - **reward model is discriminative**: compute reward score using disrm
+ - **reward model is generative**: this is not permitted (user-customized reward func **must be** provided)
+
+In most cases, we encourage users to define and use their own customized reward functions.
+
+``RewardLoopWorker`` will initialize a ``RewardManager`` via ``_init_reward_fn()``.
+Then the batch reward computation request of ``compute_score_batch`` will be processed asynchronously.
+
+.. code:: python
+
+ @ray.remote
+ class RewardLoopWorker:
+ def __init__(self, config: DictConfig, reward_router_address: str = None):
+ self.config = config
+ self.reward_router_address = reward_router_address
+ self._init_reward_fn()
+
+ def _init_reward_fn(self):
+ input_tokenizer_local_path = copy_to_local(self.config.actor_rollout_ref.model.path)
+ self.input_tokenizer = hf_tokenizer(input_tokenizer_local_path, trust_remote_code=True)
+ self.reward_model_tokenizer = None
+ if self.config.reward_model.enable:
+ reward_model_tokenizer_local_path = copy_to_local(self.config.reward_model.model.path)
+ self.reward_model_tokenizer = hf_tokenizer(reward_model_tokenizer_local_path, trust_remote_code=True)
+ self.reward_fn = get_custom_reward_fn(self.config)
+ reward_manager_cls = get_reward_manager_cls(self.config.reward_model.reward_manager)
+ self.reward_loop = reward_manager_cls(
+ self.config, self.input_tokenizer, self.reward_fn, self.reward_router_address, self.reward_model_tokenizer
+ )
+
+ async def compute_score_batch(self, data: DataProto) -> list[dict]:
+ tasks = []
+ for i in range(len(data)):
+ tasks.append(asyncio.create_task(self.compute_score(data[i : i + 1])))
+ outputs = await asyncio.gather(*tasks)
+ return outputs
+
+ async def compute_score(self, data: DataProto) -> dict:
+ assert len(data) == 1, "RewardLoopWorker only support single data item"
+ if self.config.custom_reward_function.path is not None:
+ # directly use user-customized reward function
+ return await self.reward_loop.run_single(data)
+ else:
+ if self.config.reward_model.enable:
+ # we assume the rm is disrm
+ # genrm must set custom_reward_function
+ return await self.compute_score_disrm(data)
+ else:
+ return await self.reward_loop.run_single(data)
+
+RewardManager
+~~~~~~~~~~~~~
+
+Reward Loop refactors the previous reward manager, which processed rewards sequentially on batched inputs.
+Instead, the Reward Loop performs reward computation asynchronously and in parallel at the per-sample level.
+
+In the ``RewardManager`` of Reward Loop, we implement a ``run_single`` function to compute the score for a single sample. All reward functions are executed through ``compute_score_fn``. The input should be a ``DataProto`` containing exactly one item.
+
+.. code:: python
+
+ @register("naive")
+ class NaiveRewardManager(RewardManagerBase):
+ async def run_single(self, data: DataProto) -> dict:
+ assert len(data) == 1, "Only support single data item"
+ ...
+
+Commonly used reward managers, such as ``DAPORewardManager``, have been implemented in the reward loop.
+In addition, ``RateLimitRewardManager`` is ready for use in external API-based reward computation scenarios, such as calling ChatGPT.
+
+Users can also customize their own ``RewardManager``, by adding the ``@register`` decorator, inheriting from ``RewardManagerBase``, and implementing the ``run_single`` function.
+See ``verl/experimental/reward_manager/*`` for reference.
+
+.. code:: python
+
+    @register("user_customized")
+    class UserCustomizedRewardManager(RewardManagerBase):
+ async def run_single(self, data: DataProto) -> dict:
+ assert len(data) == 1, "Only support single data item"
+ # your own reward manager
+ ...
+
+After defining it, users can specify their custom reward manager by setting ``reward_model.reward_manager=user_customized``.
+
+RewardLoopManager
+~~~~~~~~~~~~~~~~~
+
+To enable parallel reward computation, the Reward Loop launches multiple reward workers that handle reward computation requests concurrently.
+
+In **standalone mode**, we directly launch one ``RewardLoopWorker`` for each ``AgentLoopWorker`` to handle reward computation independently.
+
+In **colocate mode**, we launch a ``RewardLoopManager`` to
+
+1. launch reward model if enabled
+2. manage multiple ``RewardLoopWorker`` instances to parallelize reward computation.
+
+Users can specify the number of workers by setting ``reward_model.num_workers`` in colocate mode.
+
+.. code:: python
+
+ class RewardLoopManager:
+ """
+ RewardLoopManager run in single controller.
+ This class will create reward loop workers and manage them.
+ RewardLoopManager will deprecate fsdp/megatron RewardModelWorker in the future.
+ """
+ def __init__(self, config: DictConfig, rm_resource_pool: RayResourcePool = None):
+ self.config = config
+ if self.config.reward_model.enable:
+ self.reward_model_manager = RewardModelManager(config.reward_model, rm_resource_pool)
+ self.reward_router_address = self.reward_model_manager.get_router_address()
+ else:
+ self.reward_model_manager = None
+ self.reward_router_address = None
+
+ self._init_reward_loop_workers()
+
+ def _init_reward_loop_workers(self):
+ self.reward_loop_workers = []
+ num_workers = self.config.reward_model.get("num_workers", 1)
+ node_ids = [node["NodeID"] for node in ray.nodes() if node["Alive"] and node["Resources"].get("CPU", 0) > 0]
+
+ for i in range(num_workers):
+ # Round-robin scheduling over the all nodes
+ node_id = node_ids[i % len(node_ids)]
+ self.reward_loop_workers.append(
+ RewardLoopWorker.options(
+ name=f"reward_loop_worker_{i}",
+ scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
+ node_id=node_id,
+ soft=True,
+ ),
+ ).remote(self.config, self.reward_router_address)
+ )
+
+ def compute_rm_score(self, data: DataProto) -> DataProto:
+ """
+ Compute reward score for the given data.
+ """
+ ...
+
+
+RewardModelManager
+~~~~~~~~~~~~~~~~~~
+
+To support flexible and scalable reward model computation, Reward Loop implements a reward router that coordinates requests among multiple reward model servers.
+
+Each reward model runs as an independent server and is registered with the router.
+This router will forward the requests to the registered reward servers with load balancing and return the results.
+This design allows us to expose a single unified router address to user-defined reward functions, enabling them to access various reward models seamlessly through the same interface.
+
+.. image:: https://github.com/yyDing1/verl-materials/blob/main/reward_loop_full.svg?raw=true
+
+.. code:: python
+
+ class RewardModelManager:
+ """Reward model manager."""
+
+ def __init__(
+ self,
+ config: RewardModelConfig,
+ resource_pool: RayResourcePool = None,
+ ):
+ """
+ Initialize the reward model manager.
+
+ Args:
+ config (RewardModelConfig): Reward model configuration.
+ resource_pool (RayResourcePool, optional): Resource pool. Defaults to None.
+ """
+ self.config = config
+ self.resource_pool = resource_pool
+ self._initialize_llm_servers()
+ self._initialize_router()
+ assert self.config.rollout.skip_tokenizer_init is False, "Reward model should not skip tokenizer init."
+ if self.config.rollout.free_cache_engine:
+ self.sleep()
diff --git a/code/RL_model/verl/verl_train/docs/advance/rollout_skip.rst b/code/RL_model/verl/verl_train/docs/advance/rollout_skip.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1839beed3e46805293cc7cdf9836571b4525c7fe
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/advance/rollout_skip.rst
@@ -0,0 +1,61 @@
+RolloutSkip Function Usage Documentation
+========================================
+
+Last updated: 08/01/2025.
+
+Applicable Scenarios
+--------------------
+
+The RolloutSkip functionality is designed to accelerate the rollout process in reinforcement learning training by caching and reusing previously generated sequences. This feature is particularly useful when:
+
+1. You need to repeatedly run experiments with the same configuration
+
+2. You want to save time by avoiding redundant sequence generation, e.g., while iterating on the rest of the training pipeline
+
+
+API and Usage Example
+----------------------
+
+2.1 Trainer Adaptation
+~~~~~~~~~~~~~~~~~~~~~~
+
+Both `RayDAPOTrainer()` (in `verl/recipe/dapo/dapo_ray_trainer.py`) and `RayPPOTrainer()` (in `verl/trainer/ppo/ray_trainer.py`) have already been adapted.
+
+This is an example of how to patch rollout_skip in RayPPOTrainer.
+
+.. code-block:: python
+
+ #* Import the RolloutSkip class
+ from verl.utils.rollout_skip import RolloutSkip
+
+ ...
+ class RayPPOTrainer:
+ ...
+ def fit(self):
+ ...
+
+ #* Add code as follow:
+ rollout_skip = RolloutSkip(self.config, self.actor_rollout_wg)
+ rollout_skip.wrap_generate_sequences()
+
+ ...
+
+ for epoch in range(self.config.trainer.total_epochs):
+ for batch_dict in self.train_dataloader:
+ ...
+
+2.2 Basic Configuration
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Then, you should add the following parameters to your config to enable the RolloutSkip feature:
+
+.. code-block:: bash
+
+ actor_rollout_ref.rollout.skip_rollout=True \
+ actor_rollout_ref.rollout.skip_dump_dir="/tmp/rollout_dump" \
+
+
+Note:
+
+1. `skip_dump_dir` is the directory where the cached sequences will be stored. Ensure that this directory is writable and accessible by your training process. Also make sure that `skip_dump_dir` is not a relative path: Ray stores data under `/tmp/ray/session_/`, so a relative path will not be found by the workers.
+2. The dumped data path follows the naming pattern `{experiment_name}_{project_name}_TrainGBS{train_gbs}__InferGBS{gen_gbs}__N{n}`; once you change `experiment_name`, `project_name`, `train_gbs`, `gen_gbs`, or `n`, the cached data will be stored in a new directory.
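+
+For example (hypothetical values), with `experiment_name=exp1`, `project_name=proj`, `train_gbs=64`, `gen_gbs=64`, and `n=8`, the cache would be written under `/tmp/rollout_dump/exp1_proj_TrainGBS64__InferGBS64__N8`.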
diff --git a/code/RL_model/verl/verl_train/docs/advance/rollout_trace.rst b/code/RL_model/verl/verl_train/docs/advance/rollout_trace.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5801353cb8c64ed741e0f2ecc54c4d5c0300f260
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/advance/rollout_trace.rst
@@ -0,0 +1,146 @@
+Trace Function Usage Instructions
+========================================
+
+Last updated: 07/10/2025.
+
+Applicable Scenarios
+--------------------
+
+Agentic RL involves multiple turns of conversation, tool invocations, and user interactions during the rollout process. During model training, it is necessary to track function calls, inputs, and outputs to understand how data flows through the application. The Trace feature records the inputs, outputs, and timestamps of functions, so that in complex multi-turn conversations you can see how the data is transformed at each interaction and follow the entire process leading to the final output. This helps in understanding how the model processes data and in optimizing training results.
+
+The Trace feature integrates commonly used agent tracing tools; wandb Weave and MLflow are already supported. Users can choose the appropriate tool according to their needs and preferences. The usage of each tool is introduced below.
+
+
+Trace Parameter Configuration
+-----------------------------
+
+- ``actor_rollout_ref.rollout.trace.backend=mlflow|weave`` # the trace backend type
+- ``actor_rollout_ref.rollout.trace.token2text=True`` # To show decoded text in trace view
+- ``actor_rollout_ref.rollout.trace.max_samples_per_step_per_worker=N`` # Limit traces per worker (optional)
+
+Limiting Trace Volume
+~~~~~~~~~~~~~~~~~~~~~~
+
+By default, all samples are traced, which can generate large amounts of data and incur significant costs with trace backends like Weave or MLflow. To limit trace volume while maintaining representative coverage, use ``max_samples_per_step_per_worker``.
+
+Example configuration:
+
+.. code-block:: yaml
+
+ actor_rollout_ref:
+ rollout:
+ trace:
+ backend: weave
+ token2text: False
+ max_samples_per_step_per_worker: 5 # Each worker traces 5 random samples
+
+Each agent loop worker independently selects up to N unique samples to trace per training step. For GRPO (``n > 1``), all rollouts for selected samples are traced. Total traces per step = max_samples_per_step_per_worker * num_workers * n.
+
+Example: With 4 workers, max_samples_per_step_per_worker=5, and GRPO n=4, you get 4 * 5 * 4 = 80 traces per step instead of tracing all samples. Set to null (default) to trace all samples.
+
+
+Glossary
+--------
+
+trajectory
+    A complete multi-turn conversation, including at least one LLM output and the tool calls made along the way.
+
+step
+    The training step; corresponds to the ``global_steps`` variable in the trainer.
+
+sample_index
+    The identifier of the sample, defined in ``extra_info.index`` of the dataset. It is usually a number, but may also be a uuid in some cases.
+
+rollout_n
+    In the GRPO algorithm, each sample is rolled out n times; ``rollout_n`` is the serial number of the rollout.
+
+validate
+    Whether the rollout was produced during evaluation on the test dataset.
+
+Rollout trace functions
+-----------------------
+
+There are 2 functions used for tracing:
+
+1. ``rollout_trace_op``: a decorator used to mark the functions to trace. By default, only a few methods have it; you can add it to more functions to trace more information.
+2. ``rollout_trace_attr``: marks the entry of a trajectory and feeds metadata into the trace. If you add a new type of agent, you may need to call it to enable tracing. See the sketch below.
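+
+A minimal sketch of how these two functions might be wired together (the import path,
+method names, and attribute names here are assumptions for illustration, not verbatim
+from the verl source):
+
+.. code-block:: python
+
+    from verl.utils.rollout_trace import rollout_trace_attr, rollout_trace_op
+
+    class MyToolAgentLoop:
+        @rollout_trace_op  # records inputs, outputs, and timestamps of this call
+        async def call_tool(self, name: str, args: dict) -> str:
+            ...
+
+        async def run(self, prompt: str, step: int, sample_index: int, rollout_n: int):
+            # Marks the entry of one trajectory and attaches the metadata
+            # (step, sample_index, rollout_n, validate) used for filtering in the UI.
+            with rollout_trace_attr(step=step, sample_index=sample_index,
+                                    rollout_n=rollout_n, validate=False):
+                return await self._generate_with_tools(prompt)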
+
+
+Usage of wandb weave
+--------------------
+
+1.1 Basic Configuration
+~~~~~~~~~~~~~~~~~~~~~~~
+
+1. Set the ``WANDB_API_KEY`` environment variable
+2. Configuration Parameters
+
+ 1. ``actor_rollout_ref.rollout.trace.backend=weave``
+ 2. ``trainer.logger=['console', 'wandb']``: This item is optional. Trace and logger are independent functions. When using Weave, it is recommended to also enable the wandb logger to implement both functions in one system.
+ 3. ``trainer.project_name=$project_name``
+ 4. ``trainer.experiment_name=$experiment_name``
+   5. ``actor_rollout_ref.rollout.mode=async``: Since trace is mainly used for agentic RL, you need to enable the agent loop by using async mode with either vllm or sglang.
+
+Note:
+The Weave Free Plan comes with a default monthly network traffic allowance of 1GB. During the training process, the amount of trace data generated is substantial, reaching dozens of gigabytes per day, so it is necessary to select an appropriate wandb plan.
+
+
+1.2 View Trace Logs
+~~~~~~~~~~~~~~~~~~~
+
+After executing the training, on the project page, you can see the WEAVE sidebar. Click Traces to view it.
+
+Each trace entry corresponds to a trajectory. You can filter and select the trajectories you need to view by step, sample_index, rollout_n, and experiment_name.
+
+After enabling token2text, prompt_text and response_text will be automatically added to the output of ToolAgentLoop.run, making it convenient to view the input and output content.
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/weave_trace_list.png?raw=true
+
+1.3 Compare Trace Logs
+~~~~~~~~~~~~~~~~~~~~~~
+
+Weave can select multiple trace items and then compare the differences among them.
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/weave_trace_compare.png?raw=true
+
+Usage of mlflow
+---------------
+
+1. Basic Configuration
+~~~~~~~~~~~~~~~~~~~~~~
+
+1. Set the ``MLFLOW_TRACKING_URI`` environment variable, which can be:
+
+ 1. Http and https URLs corresponding to online services
+   2. A local database or directory, such as ``sqlite:////tmp/mlruns.db``, which stores data in ``/tmp/mlruns.db``. When using a local file, initialize it first (e.g., by starting the UI: ``mlflow ui --backend-store-uri sqlite:////tmp/mlruns.db``) to avoid conflicts when multiple workers create the file simultaneously.
+
+2. Configuration Parameters
+
+ 1. ``actor_rollout_ref.rollout.trace.backend=mlflow``
+ 2. ``trainer.logger=['console', 'mlflow']``. This item is optional. Trace and logger are independent functions. When using mlflow, it is recommended to also enable the mlflow logger to implement both functions in one system.
+ 3. ``trainer.project_name=$project_name``
+ 4. ``trainer.experiment_name=$experiment_name``
+
+
+2. View Log
+~~~~~~~~~~~
+
+Since ``trainer.project_name`` corresponds to an Experiment in mlflow, select the corresponding project name in the mlflow view, then click the "Traces" tab to view traces. ``trainer.experiment_name`` is recorded as the ``experiment_name`` tag, and tags such as step, sample_index, and rollout_n can be used for filtering and viewing.
+
+For example, searching for ``"tags.step = '1'"`` can display all trajectories of step 1.
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/mlflow_trace_list.png?raw=true
+
+Opening one of the trajectories allows you to view each function call process within it.
+
+After enabling token2text, prompt_text and response_text will be automatically added to the output of ToolAgentLoop.run, making it convenient to view the content.
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/mlflow_trace_view.png?raw=true
+
+Note:
+
+1. mlflow does not support comparing multiple traces.
+2. rollout_trace cannot associate the mlflow trace with the run, so the trace content cannot be seen in the mlflow run logs.
diff --git a/code/RL_model/verl/verl_train/docs/advance/rope.rst b/code/RL_model/verl/verl_train/docs/advance/rope.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9463549e47d055552a273e83a851fc76f93f9d1a
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/advance/rope.rst
@@ -0,0 +1,39 @@
+RoPE Scaling override
+=======================================
+
+Last updated: 05/14/2025.
+
+Some models such as `Qwen/Qwen2.5-7B-Instruct <https://huggingface.co/Qwen/Qwen2.5-7B-Instruct>`_ support RoPE Scaling but don't have it defined in their config.json file.
+For example, this model supports this configuration:
+
+.. code:: python
+
+ {
+ ...,
+ "rope_scaling": {
+ "factor": 4.0,
+ "original_max_position_embeddings": 32768,
+ "type": "yarn"
+ }
+ }
+
+
+
+In order to support a longer context for such models, you must override the model configs when starting the trainer.
+
+PPO example:
+
+.. code:: bash
+
+ +actor_rollout_ref.model.override_config.rope_scaling.type=yarn \
+ +actor_rollout_ref.model.override_config.rope_scaling.factor=4.0 \
+ +actor_rollout_ref.model.override_config.rope_scaling.original_max_position_embeddings=32768 \
+
+
+And for the critic model
+
+.. code:: bash
+
+ +critic.model.override_config.rope_scaling.type=yarn \
+ +critic.model.override_config.rope_scaling.factor=4.0 \
+ +critic.model.override_config.rope_scaling.original_max_position_embeddings=32768 \
diff --git a/code/RL_model/verl/verl_train/docs/algo/baseline.md b/code/RL_model/verl/verl_train/docs/algo/baseline.md
new file mode 100644
index 0000000000000000000000000000000000000000..ca821865f44f9a3697688d43d80f501d9a771df7
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/algo/baseline.md
@@ -0,0 +1,73 @@
+# Algorithm Baselines
+
+Last updated: 06/18/2025.
+
+## Math related datasets
+
+### GSM8k
+
+Assuming the GSM8k/MATH datasets are preprocessed via:
+
+```bash
+python3 examples/data_preprocess/*.py
+```
+
+Refer to the table below to reproduce RL training from different pre-trained checkpoints. Unless specified otherwise, the scores below are on the GSM8k dataset. More comprehensive benchmark results are available in the recipe folder.
+
+| Hardware | Model | Method | Test score | Details |
+| ---------- | -------------------------------- | --------------- | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| NVIDIA GPU | google/gemma-2-2b-it | hf checkpoint | 23.9 | [Huggingface](https://huggingface.co/google/gemma-2-2b-it#benchmark-results) |
+| NVIDIA GPU | google/gemma-2-2b-it | SFT | 52.06 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/gemma-2-2b-it-sft-0.411.log) |
+| NVIDIA GPU | google/gemma-2-2b-it | SFT + PPO | 64.02 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/gemma-2-2b-it-ppo-bsz512_4-prompt1024-resp-512-0.640.log), [wandb](https://api.wandb.ai/links/verl-team/h7ux8602) |
+| NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | hf checkpoint | 49.6 | [Qwen blog](https://qwen.ai/blog?id=qwen2.5-llm) |
+| NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | PPO | 56.7 | [command and log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log) |
+| NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | PRIME | 58.7 | [script](https://github.com/verl-project/verl-recipe/blob/main//prime/run_prime_qwen.sh), [wandb](https://api.wandb.ai/links/zefan-wang-thu-tsinghua-university/rxd1btvb) |
+| NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | GRPO-LoRA | 54.3 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz64_2-prompt512-resp1024-lorarank32-score0.543.log) |
+| NVIDIA GPU | Qwen/Qwen2.5-1.5B-Instruct | GRPO-LoRA | 77.9 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-1.5B-bsz64_2-prompt512-resp1024-lorarank32-score0.779.log) |
+| NVIDIA GPU | Qwen/Qwen2.5-3B-Instruct | GRPO-LoRA | 86.1 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-3B-bsz64_2-prompt512-resp1024-lorarank32-score0.861.log) |
+| NVIDIA GPU | deepseek-ai/deepseek-llm-7b-chat | PPO (Megatron) | 69.5 [1] | [log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/deepseek-llm-7b-chat-megatron-bsz256_4-prompt512-resp512-0.695.log), [wandb](https://wandb.ai/verl-team/verl_megatron_gsm8k_examples/runs/10fetyr3) |
+| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GRPO | 89 | [script](https://github.com/volcengine/verl/blob/a65c9157bc0b85b64cd753de19f94e80a11bd871/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh) |
+| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GRPO (FSDP2) | 89.8 | [log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b-fsdp2.log) |
+| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GRPO (Megatron) | 89.6 | [log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b_math_megatron.log) |
+| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | ReMax | 97 | [script](https://github.com/eric-haibin-lin/verl/blob/main/examples/remax_trainer/run_qwen2.5-3b_seq_balance.sh), [wandb](https://wandb.ai/liziniu1997/verl_remax_example_gsm8k/runs/vxl10pln) |
+| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | SPPO | 65.6 (MATH) | [SPPO script](https://github.com/volcengine/verl-recipe/tree/main/sppo/README.md) |
+| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | GRPO-LoRA | 93.4 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-7B-bsz64_8-prompt512-resp1024-lorarank32-score0.934.log) |
+| NVIDIA GPU | Mixtral-8x22B-Instruct-v0.1 | Instruct model | 83.7 | [Qwen Blog](https://qwen.ai/blog?id=qwen2.5-llm) |
+| NVIDIA GPU | Mixtral-8x22B-Instruct-v0.1 | RLOO (Megatron) | 92.3 | [wandb](https://api.wandb.ai/links/ppo_dev/sbuiuf2d) |
+| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | SPIN | 92 | [script](https://github.com/volcengine/verl-recipe/tree/main/spin/README.md) |
+| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GPG | 88 | [log](https://github.com/diqiuzhuanzhuan/verldata/blob/main/run_logs/qwen2-7b_math.log), [wandb](https://wandb.ai/diqiuzhuanzhuan/verl_gpg_example_gsm8k_math/runs/ab86c4va) |
+| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GPG (Megatron) | 88 | [log](https://github.com/diqiuzhuanzhuan/verldata/blob/main/run_logs/qwen2-7b_math_megatron.log), [wandb](https://wandb.ai/diqiuzhuanzhuan/verl_gpg_example_gsm8k_math/runs/yy8bheu8) |
+| NVIDIA GPU | Qwen/Qwen2.5-VL-7B-Instruct | GRPO (Megatron) | 65.4 (GEO3k) | [script](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh), [wandb](https://api.wandb.ai/links/megatron-core-moe-dev/1yngvkek) |
+| AMD MI300 | deepseek-ai/deepseek-llm-7b-chat | PPO | 70.5 [1] | [log](https://github.com/yushengsu-thu/verl_training_log/blob/main/gsm8k/ppo_run_deepseek7b_llm.log) |
+| AMD MI300 | deepseek-ai/deepseek-llm-7b-chat | GRPO | 71.4 [1] | [log](https://github.com/yushengsu-thu/verl_training_log/blob/main/gsm8k/grpo_run_deepseek7b_llm.log) |
+| NVIDIA GPU | Qwen/Qwen2.5-14B-Instruct | GRPO-LoRA | 94.6 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-14B-bsz64_8-prompt512-resp1024-lorarank32-score0.946.log) |
+| NVIDIA GPU | Qwen/Qwen2.5-32B-Instruct | GRPO-LoRA | 95.8 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-32B-bsz64_8-prompt512-resp1024-lorarank32-score0.958.log) |
+| NVIDIA GPU | Qwen/Qwen2.5-72B-Instruct | GRPO-LoRA | 96.0 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-72B-bs64_8-prompt512-resp1024-lorarank32-score0.960.log) |
+
+### DAPO math-17k
+
+- Training DAPO math-17k dataset: https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k
+- Testing: AIME'24: https://huggingface.co/datasets/BytedTsinghua-SIA/AIME-2024
+
+Note:
+
+- For Qwen/Qwen2.5-Math-7B, we directly modify `max_position_embeddings` to 32768 in order to train with longer response lengths; no performance degradation was observed.
+
+| Hardware | Model | Method | Test score | Details |
+| ---------- | -------------------------- | ----------------------- | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| NVIDIA GPU | Qwen/Qwen2.5-Math-7B (32k) | DAPO | 36.3 | [command](https://github.com/verl-project/verl-recipe/blob/main//dapo/test_dapo_7b_math.sh), [logs](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361) |
+| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | DAPO + Code Interpreter | 40.0 | [command](https://github.com/verl-project/verl-recipe/blob/main//retool/run_qwen2_7b_dapo.sh) |
+
+## Coding related datasets
+
+Unless specified otherwise, the results below are on the LeetCode dataset.
+
+| Hardware | Model | Method | Test score | Details |
+| ---------- | ----------------------- | ------ | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| NVIDIA GPU | PRIME-RL/Eurus-2-7B-SFT | PRIME  | 36.1       | [script](https://github.com/verl-project/verl-recipe/blob/main//prime/run_prime_qwen_code.sh), [swanlab](https://swanlab.cn/@wangzefan/prime_example/runs/7f541qhspgmy8nmhdlx35/chart)   |
+
+### Notes
+
+[1] During evaluation, we have only extracted answers following the format `"####"`. A more flexible answer extraction, longer response length, and better prompt engineering may lead to a higher score.
+
+[2] The default value of `actor_rollout_ref.actor.entropy_coeff` is set to `0.0` since verl 0.3.x on 2025-05-30, which is different from previous versions.
diff --git a/code/RL_model/verl/verl_train/docs/algo/collabllm.md b/code/RL_model/verl/verl_train/docs/algo/collabllm.md
new file mode 100644
index 0000000000000000000000000000000000000000..3279e0ff3a43b4154c9ee54ed80452ea997408e0
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/algo/collabllm.md
@@ -0,0 +1,105 @@
+# Recipe: CollabLLM
+
+Last updated: 09/22/2025.
+
+> Open-Source Algorithm Implementation & Experiment Running: [Haiquan Chen](https://github.com/chenhaiq), [Shirley Wu](https://github.com/Wuyxin)
+
+🏠 [Homepage](https://aka.ms/CollabLLM) | 📝 [Paper](https://arxiv.org/pdf/2502.00640) | 🤗 [Datasets & Models](https://huggingface.co/collabllm) | ⭐️ [Original Implementation](https://github.com/Wuyxin/collabllm)
+
+`verl` provides a recipe for the Outstanding Paper at ICML 2025, **"CollabLLM: From Passive Responders to Active Collaborators"**. [CollabLLM](https://aka.ms/CollabLLM) is a unified fine-tuning framework that optimizes LLMs for effective and efficient multiturn collaboration with users.
+
+**Core Idea:** Models are rewarded based on how well their responses enable effective *future* collaboration with users.
+
+Paper Authors: [Shirley Wu](https://cs.stanford.edu/~shirwu/), [Michel Galley](https://www.microsoft.com/en-us/research/people/mgalley/), Baolin Peng, Hao Cheng, Gavin Li, Yao Dou, Weixin Cai, [James Zou](https://www.james-zou.com/), [Jure Leskovec](https://cs.stanford.edu/people/jure/), [Jianfeng Gao](https://www.microsoft.com/en-us/research/people/jfgao/)
+
+
+---
+## Quick Start
+
+### 0. Environment
+Make sure the required packages for `verl` are installed. Additionally, install `litellm` and export the required API keys. The API model will be used for user simulators and, optionally, LLM Judges (see the Configuration section below).
+
+### 1. Prepare Your Dataset
+
+First, process your dataset using the provided script (see example commands and usage in `process_dataset.py`):
+
+```bash
+python process_dataset.py --dataset <dataset_name> ... --dataset_type <dataset_type>
+```
+
+
+**Requirements:**
+- Input: A Hugging Face multiturn dataset. Existing datasets: `collabllm/collabllm-multiturn-$DATASET`, with `DATASET` in one of [`math-hard(-large)`, `medium(-large)`, `bigcodebench(-large)`] (*-large are the datasets used in the CollabLLM paper)
+- Example format: See [collabllm-multiturn-math-hard](https://huggingface.co/datasets/collabllm/collabllm-multiturn-math-hard)
+- To generate your own dataset: Use [build_dataset.py](https://github.com/Wuyxin/collabllm/blob/main/scripts/engine/build_dataset.py) from the original CollabLLM repository
+
+
+### 2. Train Your Model
+
+**(Optional) For Supervised Fine-Tuning (SFT):**
+```bash
+bash train_sft_collabllm.sh
+```
+
+**For Reinforcement Learning (RL):**
+
+```bash
+bash train_rl_collabllm.sh
+```
+
+The RL script shows an example to train CollabLLM on `math-hard-large`.
+
+- The config to sample future conversations is in `recipe/collabllm/config/collabllm_interaction_config.yaml`.
+- The Multiturn-aware Reward is aggregated from these three conversational-level rewards:
+
+ ```
+ +reward_model.reward_kwargs.metric_weights.accuracy=1 \
+ +reward_model.reward_kwargs.metric_weights.interactivity=1 \
+ +reward_model.reward_kwargs.metric_weights.token_amount=-0.0001 \
+ ```
+
+   You can remove, add, or modify the weights depending on your task. A list of implemented metrics you can already add is under `recipe/collabllm/metrics`. For example, on `medium-large`, you can replace `accuracy` with `bleu_score` via
+ ```
+ +reward_model.reward_kwargs.metric_weights.bleu_score=1
+ ```
+   which will instead apply the BLEU score to the sampled future conversations.
+
+## Algorithm
+
+| Step | Name | Description |
+|------|-------------------------------|-----------------------------------------------------------------------------|
+| 1 | Model response generation | The model generates multiple responses for each prompt in a batch. |
+| 2 | Collaborative simulation | A user simulator (e.g., GPT or Claude) samples `num_repeat_rollouts` conversations for up to `max_user_turns` additional turns. |
+| 3 | Compute Multiturn-aware Reward | Customized conversational reward functions are applied to the sampled conversations. Rewards are aggregated, then averaged across rollouts. |
+| 4 | Update model | The model weights are updated using the computed multiturn-aware rewards. |
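+
+To make steps 2-4 concrete, here is a minimal sketch of how the Multiturn-aware Reward
+could be aggregated (function and variable names are illustrative, not the actual code
+in `verl/workers/reward_manager/collabllm.py`):
+
+```python
+def multiturn_aware_reward(rollout_metrics, metric_weights):
+    """rollout_metrics: one {metric_name: score} dict per sampled future conversation.
+    metric_weights: e.g. {"accuracy": 1, "interactivity": 1, "token_amount": -0.0001}."""
+    per_rollout = [
+        sum(metric_weights[m] * scores[m] for m in metric_weights)
+        for scores in rollout_metrics
+    ]
+    # average across the num_repeat_rollouts sampled conversations
+    return sum(per_rollout) / len(per_rollout)
+```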
+
+---
+
+## Configuration
+
+The primary configuration is managed through the launch script `train_rl_collabllm.sh` and the YAML file `recipe/collabllm/config/collabllm_interaction_config.yaml`. Key configuration sections:
+
+| Section | Key Parameters / Notes |
+|----------------------|-----------------------------------------------------------------------------------------|
+| `data` | Paths to training/validation files, batch sizes, sequence lengths. |
+| `actor_rollout_ref` (common) | Base model path (used for actor + initial reference), FSDP settings, optimization (LR, scheduler). |
+| `actor_rollout_ref` (CollabLLM-specific) | Hyperparameters under `actor_rollout_ref.rollout.multi_turn`: `max_user_turns`, `max_assistant_turns`, `num_repeat_rollouts`. |
+| `interaction` | Defined in `collabllm_interaction_config.yaml`. Specifies user simulator and hyperparameters. Requires exported API keys. |
+| `reward_model` | Manager set to `collabllm` by default. Modify `reward_model.reward_kwargs.metric_weights` for conversational rewards and weights. LLM Judge hyperparameters (e.g., `model`, `temperature`) go under `reward_model.reward_kwargs.llm_judge_kwargs`. |
+| `algorithm` | GRPO-specific hyperparameters such as `actor_rollout_ref.rollout.n`. |
+| `trainer` | Distributed training (nodes, GPUs per node), logging (WandB), checkpointing frequency. |
+
+---
+
+## Key Files
+
+| File Path | Purpose |
+|-----------|---------|
+| `recipe/collabllm/collabllm_agent_loop.py` | Main logic to sample future conversations, using `CollabLLMInteraction` from `verl/interactions/collabllm_interaction.py`. |
+| `verl/workers/reward_manager/collabllm.py` | Computes rewards for future conversations, leveraging `recipe/collabllm/reward_function.py` to apply each metric. |
+
+---
+
+## Acknowledgement
+
+We sincerely thank the `verl` community and advisors for their contributions and guidance!
diff --git a/code/RL_model/verl/verl_train/docs/algo/dapo.md b/code/RL_model/verl/verl_train/docs/algo/dapo.md
new file mode 100644
index 0000000000000000000000000000000000000000..beb1ca5fb98d7dbc59e6044fd8fc34d67fab5da5
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/algo/dapo.md
@@ -0,0 +1,187 @@
+# Recipe: Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO)
+
+Last updated: 06/19/2025.
+
+> Open-Source Algorithm Implementation & Experiment Running: [Yuxuan Tong](https://tongyx361.github.io/), [Guangming Sheng](https://hk.linkedin.com/in/guangming-sheng-b50640211)
+
+🏠 [Homepage](https://dapo-sia.github.io/) | 📝 [Paper@arXiv](https://arxiv.org/abs/2503.14476) | 🤗 [Datasets&Models@HF](https://huggingface.co/collections/BytedTsinghua-SIA/dapo-67d7f1517ee33c8aed059da0) | 🐱 [Code@GitHub](https://github.com/verl-project/verl-recipe/tree/main/dapo/recipe/dapo) | 🐱 [Repo@GitHub](https://github.com/BytedTsinghua-SIA/DAPO)
+
+> We propose the **D**ecoupled Clip and Dynamic s**A**mpling **P**olicy **O**ptimization (DAPO) algorithm. By making our work publicly available, we provide the broader research community and society with practical access to scalable reinforcement learning, enabling all to benefit from these advancements. Our system is based on the awesome [verl](https://github.com/volcengine/verl) framework. Thanks for their great work! Applying DAPO training to Qwen2.5-32B base model proves to outperform the previous state-of-the-art DeepSeek-R1-Zero-Qwen-32B on AIME 2024, achieving **50%** accuracy with **50%** less training steps.
+>
+> 
+
+## Quickstart
+
+1. Prepare the datasets **on the Ray cluster**:
+
+```bash
+bash prepare_dapo_data.sh # This downloads the datasets to ${HOME}/verl/data by default
+```
+
+2. Submit the job to the Ray cluster **from any machine**:
+
+```bash
+cd verl # Repo root
+export RAY_ADDRESS="http://${RAY_IP:-localhost}:8265" # The Ray cluster address to connect to
+export WORKING_DIR="${PWD}" # The local directory to package to the Ray cluster
+# Set the runtime environment like env vars and pip packages for the Ray cluster in yaml
+export RUNTIME_ENV="./recipe/dapo/runtime_env.yaml" # This sets environment variables for the Ray cluster
+bash recipe/dapo/run_dapo_qwen2.5_32b.sh # or other scripts
+```
+
+## Reproduction Runs
+
+| Setup | AIME 2024 Acc. | Hardware | Image | Commit | Environment Variables | Training Script | Training Record |
+| -------------------------------------------- | -------------- | --------- | -------------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------- |
+| DAPO | 52% | 16x8xH800 | `hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0` | [`4f80e4`](https://github.com/volcengine/verl/tree/4f80e465c2ec79ab9c3c30ec74b9745de61d0490) | [runtime_env.yaml](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/runtime_env.yaml) | [run_dapo_qwen2.5_32b.sh](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/run_dapo_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) |
+| DAPO w/o Dynamic Sampling | 50% | 16x8xH800 | `hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0` | [`4f80e4`](https://github.com/volcengine/verl/tree/4f80e465c2ec79ab9c3c30ec74b9745de61d0490) | [runtime_env.yaml](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/runtime_env.yaml) | [run_dapo_wo_ds_qwen2.5_32b.sh](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/run_dapo_wo_ds_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) |
+| DAPO w/o Token-level Loss & Dynamic Sampling | 44% | 16x8xH20 | `hiyouga/verl:ngc-th2.5.1-cu120-vllm0.7.4-hotfix` | [`4f80e4`](https://github.com/volcengine/verl/tree/4f80e465c2ec79ab9c3c30ec74b9745de61d0490) | [runtime_env.yaml](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/runtime_env.yaml) | [run_dapo_early_qwen2.5_32b.sh](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/run_dapo_early_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) |
+
+> [!IMPORTANT]
+>
+> **📢 Call for Contribution!**
+>
+> Welcome to submit your reproduction runs and setups!
+
+## Configuration
+
+### Separated Clip Epsilons (-> Clip-Higher)
+
+An example configuration:
+
+```yaml
+actor_rollout_ref:
+ actor:
+ clip_ratio_low: 0.2
+ clip_ratio_high: 0.28
+```
+
+`clip_ratio_low` and `clip_ratio_high` specify the $\varepsilon_{\text{low}}$ and $\varepsilon_{\text{high}}$ in the DAPO objective.
+
+Core relevant code:
+
+```python
+pg_losses1 = -advantages * ratio
+pg_losses2 = -advantages * torch.clamp(ratio, 1 - cliprange_low, 1 + cliprange_high)
+pg_losses = torch.maximum(pg_losses1, pg_losses2)
+```
+
+### Dynamic Sampling (with Group Filtering)
+
+An example configuration:
+
+```yaml
+data:
+ gen_batch_size: 1536
+ train_batch_size: 512
+algorithm:
+ filter_groups:
+ enable: True
+ metric: acc # score / seq_reward / seq_final_reward / ...
+ max_num_gen_batches: 10 # Non-positive values mean no upper limit
+```
+
+Setting `filter_groups.enable` to `True` will filter out groups whose outputs' `metric` are all the same, e.g., for `acc`, groups whose outputs' accuracies are all 1 or 0.
+
+The trainer will repeat sampling with `gen_batch_size` until there are enough qualified groups for `train_batch_size` or reaching the upper limit specified by `max_num_gen_batches`.
+
+Core relevant code:
+
+```python
+prompt_bsz = self.config.data.train_batch_size
+if num_prompt_in_batch < prompt_bsz:
+ print(f'{num_prompt_in_batch=} < {prompt_bsz=}')
+ num_gen_batches += 1
+ max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
+ if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
+ print(f'{num_gen_batches=} < {max_num_gen_batches=}. Keep generating...')
+ continue
+ else:
+ raise ValueError(
+ f'{num_gen_batches=} >= {max_num_gen_batches=}. Generated too many. Please check your data.'
+ )
+else:
+ # Align the batch
+ traj_bsz = self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n
+ batch = batch[:traj_bsz]
+```
+
+### Flexible Loss Aggregation Mode (-> Token-level Loss)
+
+An example configuration:
+
+```yaml
+actor_rollout_ref:
+ actor:
+ loss_agg_mode: "token-mean" # / "seq-mean-token-sum" / "seq-mean-token-mean"
+ # NOTE: "token-mean" is the default behavior
+```
+
+Setting `loss_agg_mode` to `token-mean` averages the (policy gradient) loss across all the tokens in all the sequences in a mini-batch.
+
+Core relevant code:
+
+```python
+if loss_agg_mode == "token-mean":
+ loss = verl_F.masked_mean(loss_mat, loss_mask)
+elif loss_agg_mode == "seq-mean-token-sum":
+ seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) # token-sum
+ loss = torch.mean(seq_losses) # seq-mean
+elif loss_agg_mode == "seq-mean-token-mean":
+ seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) / torch.sum(loss_mask, dim=-1) # token-mean
+ loss = torch.mean(seq_losses) # seq-mean
+else:
+ raise ValueError(f"Invalid loss_agg_mode: {loss_agg_mode}")
+```
+
+### Overlong Reward Shaping
+
+An example configuration:
+
+```yaml
+data:
+ max_response_length: 20480 # 16384 + 4096
+reward_model:
+ overlong_buffer:
+ enable: True
+ len: 4096
+ penalty_factor: 1.0
+```
+
+Setting `overlong_buffer.enable` to `True` will penalize the outputs whose lengths are overlong but still within the hard context limit.
+
+Specifically, the penalty increases linearly from `0` to `overlong_buffer.penalty_factor` when the length of the output exceeds the `max_response_length - overlong_buffer.len` by `0` to `overlong_buffer.len` tokens.
+
+Core relevant code:
+
+```python
+if self.overlong_buffer_cfg.enable:
+ overlong_buffer_len = self.overlong_buffer_cfg.len
+ expected_len = self.max_resp_len - overlong_buffer_len
+ exceed_len = valid_response_length - expected_len
+ overlong_penalty_factor = self.overlong_buffer_cfg.penalty_factor
+ overlong_reward = min(-exceed_len / overlong_buffer_len * overlong_penalty_factor, 0)
+ reward += overlong_reward
+```
+
+## FAQ
+
+### Where is the "Overlong Filtering" in the paper?
+
+Most experiments in the paper, including the best-performing one, were run without Overlong Filtering, because it largely overlaps with Overlong Reward Shaping in terms of properly learning from the longest outputs. So we don't implement it here.
+
+### What's the difference between [the `recipe/dapo` directory in the `main` branch](https://github.com/volcengine/verl-recipe/tree/main/dapo) and the [`recipe/dapo` branch](https://github.com/verl-project/verl-recipe/tree/main/dapo/recipe/dapo)?
+
+[The `recipe/dapo` branch](https://github.com/verl-project/verl-recipe/tree/main/dapo/recipe/dapo) is for **as-is reproduction** and thus won't be updated with new features.
+
+[The `recipe/dapo` directory in the `main` branch](https://github.com/volcengine/verl-recipe/tree/main/dapo) works as an example of how to extend the latest `verl` to implement an algorithm recipe, which will be maintained with new features.
+
+### Why can't I produce similar results after modifications?
+
+RL infrastructure today still suffers from inherent non-robustness, which we are working hard to improve.
+
+We strongly recommend modifying only one thing at a time.
+
+We also list some known problems here:
+
+1. Enabling CUDA graph (`enforce_eager=False`) might cause model performance degradation, whose cause is still under investigation.
diff --git a/code/RL_model/verl/verl_train/docs/algo/entropy.md b/code/RL_model/verl/verl_train/docs/algo/entropy.md
new file mode 100644
index 0000000000000000000000000000000000000000..46153b7e8558583c9d4a0201a1317f09c6c1ecb1
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/algo/entropy.md
@@ -0,0 +1,115 @@
+# Recipe: Entropy Mechanism
+
+Last updated: 06/27/2025.
+
+The Entropy Mechanism of Reinforcement Learning for Large Language Model Reasoning.
+
+[Paper](https://arxiv.org/pdf/2505.22617) | [GitHub](https://github.com/PRIME-RL/Entropy-Mechanism-of-RL) | [alphaXiv](https://www.alphaxiv.org/abs/2505.22617) | [X post](https://x.com/stingning/status/1928088554166505667) | [X post](https://x.com/charlesfornlp/status/1928089451080585283) | [X post](https://x.com/_akhaliq/status/1928077929105268861)
+
+
+## 🎉News
+
+- **[2025/05/29]** 🎉 Ranked **#1** of the day on [Huggingface Daily Papers](https://huggingface.co/papers?date=2025-05-29).
+- **[2025/05/29]** Released our Paper on arXiv. See [here](https://arxiv.org/pdf/2505.22617). We provide insights into the entropy mechanism of RL for LLMs and propose two simple yet effective strategies to alleviate the entropy collapse.
+
+
+
+## ✨Getting started
+
+After preparing the training data, for training Qwen2.5-7B on a single node, taking the KL-Cov approach as an example, you can simply run:
+
+```bash
+cd verl
+conda activate your_env
+bash recipe/dapo/7b_kl_cov.sh
+```
+
+While for training Qwen2.5-32B on multi nodes, you can run the following commands:
+
+```bash
+cd verl
+conda activate your_env
+bash recipe/dapo/32b_kl_cov.sh
+```
+
+## 📖Introduction
+
+
+

+
+
+This paper addresses the entropy collapse issue in scaling reinforcement learning (RL) for large language models (LLMs), where policy entropy drops sharply during training, leading to overconfidence and performance saturation. We empirically establish a relationship between entropy ($H$) and performance ($R$): $R = -a\exp(H) + b$, showing that performance is bottlenecked by entropy exhaustion.
+
+
+

+
+
+Theoretically, we find entropy changes are driven by the covariance between action probability and logit updates, which correlates with advantage in Policy Gradient methods. High-probability, high-advantage actions reduce entropy, while rare, high-advantage actions increase it. Empirically, the covariance term remains positive, explaining entropy’s monotonic decline. To mitigate this, we propose Clip-Cov and KL-Cov, which restrict updates for high-covariance tokens. These methods effectively prevent entropy collapse, and improve performance.
+
+## 📃Evaluation
+
+
+

+
+
+
+Our method is able to maintain a considerably higher level of entropy throughout training. For example, when the baseline's entropy reaches a plateau and can no longer be consumed, the KL-Cov method still sustains an entropy level over 10 times higher. Meanwhile, the response length of the policy model steadily increases, and its performance on the test set consistently surpasses that of the baseline. This indicates that our model is able to explore more freely during training, learning a better policy through RL.
+
+| **Method** | **AIME24** | **AIME25** | **AMC** | **MATH-500** | **OMNI-MATH** | **OlympiadBench** | **Minerva** | **Avg.** |
+| ----------------- | ---------: | ---------: | -------: | -----------: | ------------: | ----------------: | ----------: | -------: |
+| *Qwen2.5-7B* | | | | | | | | |
+| GRPO | 21.2 | 9.6 | 58.7 | 78.8 | 27.9 | 40.7 | 36.7 | 38.6 |
+| w. Clip-higher | 18.1 | 11.5 | 56.6 | 79.2 | 29.8 | 43.3 | 40.4 | 38.8 |
+| w. **`CLIP-Cov`** | 22.1 | **15.8** | 58.2 | 80.4 | **30.5** | **44.1** | **41.1** | 40.4 |
+| w. **`KL-Cov`** | **22.6** | 12.9 | **61.4** | **80.8** | 29.1 | 42.6 | 38.2 | **40.6** |
+| *Qwen2.5-32B* | | | | | | | | |
+| GRPO | 21.8 | 16.2 | 69.7 | 84.2 | 35.2 | 43.6 | 45.5 | 45.8 |
+| w. Clip-higher | 35.6 | 22.3 | 69.5 | 77.2 | 35.1 | 42.5 | 43.0 | 47.2 |
+| w. **`CLIP-Cov`** | 32.3 | 22.7 | 67.2 | **87.0** | **42.0** | **57.2** | 46.0 | 50.3 |
+| w. **`KL-Cov`** | **36.8** | **30.8** | **74.5** | 84.6 | 39.1 | 49.0 | **46.3** | **52.2** |
+
+Our two approaches both achieve non-trivial improvements across all benchmarks. Compared to GRPO, our method outperforms it by 2.0% on average for the 7B model and by 6.4% for the 32B model. Moreover, we observe that our method yields more substantial gains on the larger Qwen2.5-32B. Specifically, our method achieves improvements of 15.0% and 14.6% compared to GRPO on the most challenging benchmarks, AIME24 and AIME25, respectively.
+
+
+## 🎈Citation
+If you find this paper or repo helpful, please cite us.
+
+```bibtex
+@article{cui2025entropy,
+ title={The Entropy Mechanism of Reinforcement Learning for Reasoning Language Models},
+ author={Cui, Ganqu and Zhang, Yuchen and Chen, Jiacheng and Yuan, Lifan and Wang, Zhi and Zuo, Yuxin and Li, Haozhan and Fan, Yuchen and Chen, Huayu and Chen, Weize and others},
+ journal={arXiv preprint arXiv:2505.22617},
+ year={2025}
+}
+```
+## 🌻Acknowledgement
+We implement our reinforcement learning algorithm extending from [verl](https://github.com/volcengine/verl). We utilize [vLLM](https://github.com/vllm-project/vllm) for inference. Our models are trained primarily on [Qwen2.5 family](https://github.com/QwenLM/Qwen2.5). Our training data is built from [DAPO-MATH](https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k). Thanks for their great contributions!
+
+## 📬 Contact
+
+For questions, discussion, or collaboration opportunities, feel free to contact:
+- Ganqu Cui: cuiganqu@pjlab.org.cn
+- Yuchen Zhang: yuchen.zhang2003@gmail.com
+- Jiacheng Chen: jackchan9345@gmail.com
+- Ning Ding: ningding.cs@gmail.com
+
diff --git a/code/RL_model/verl/verl_train/docs/algo/gpg.md b/code/RL_model/verl/verl_train/docs/algo/gpg.md
new file mode 100644
index 0000000000000000000000000000000000000000..36bede8c319040ae713ef335372f2caa40ce44a3
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/algo/gpg.md
@@ -0,0 +1,36 @@
+# GPG: Group Policy Gradient
+
+Last updated: 07/03/2025.
+
+Group Policy Gradient (GPG) is a minimalist reinforcement learning (RL) method that enhances the reasoning ability of large language models without relying on supervised fine-tuning or complex tricks. GPG revisits traditional policy gradients and directly optimizes the RL objective—no surrogate losses, no KL penalties, no critic, and no reference model. Compared to GRPO, GPG is simpler, more efficient, and achieves better results on many tasks. For more details, please refer to the original paper [GPG: A Simple and Strong Reinforcement Learning Baseline for Model Reasoning](https://arxiv.org/abs/2504.02546).
+
+## Key Components
+- Uses a corrected advantage function to improve policy gradient accuracy and training efficiency.
+- Eliminates the critic and reference models and avoids KL divergence constraints, which significantly simplifies the training process compared to Group Relative Policy Optimization (GRPO). A rough sketch of the resulting loss follows this list.
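+
+The sketch below is illustrative only (the exact loss, including GPG's advantage
+correction, lives in verl's `gpg` loss mode and in the paper):
+
+```python
+import torch
+
+def gpg_policy_loss(log_prob, advantages, response_mask):
+    """Plain policy-gradient loss: no surrogate ratio, no clipping, no KL term.
+    log_prob:      (B, T) log-probs of the sampled tokens under the current policy
+    advantages:    (B, T) group-relative advantages
+    response_mask: (B, T) 1 for response tokens, 0 for padding
+    """
+    return -(advantages * log_prob * response_mask).sum() / response_mask.sum()
+```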
+
+## Configuration
+To configure GPG within the framework, use the following YAML settings.
+
+```yaml
+algorithm:
+ adv_estimator: gpg
+actor_rollout_ref:
+ actor:
+ policy_loss:
+ loss_mode: "gpg"
+```
+
+## Advanced Extensions
+GPG is a simple and strong baseline for model reasoning. Although it avoids using KL loss in its original form, you can still use KL loss to further improve the performance.
+
+```yaml
+algorithm:
+ adv_estimator: gpg
+actor_rollout_ref:
+ actor:
+ use_kl_loss: True # enable kl regularization
+ kl_loss_coef: 0.01
+ policy_loss:
+ loss_mode: "gpg"
+```
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/docs/algo/grpo.md b/code/RL_model/verl/verl_train/docs/algo/grpo.md
new file mode 100644
index 0000000000000000000000000000000000000000..c25f401f9045026d20c8446694702d1f9cbfbc3b
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/algo/grpo.md
@@ -0,0 +1,72 @@
+# Group Relative Policy Optimization (GRPO)
+
+Last updated: 05/31/2025.
+
+In reinforcement learning, classic algorithms like PPO rely on a "critic" model to estimate the value of actions, guiding the learning process. However, training this critic model can be resource-intensive.
+
+GRPO simplifies this process by eliminating the need for a separate critic model. Instead, it operates as follows:
+- Group Sampling: For a given problem, the model generates multiple possible solutions, forming a "group" of outputs.
+- Reward Assignment: Each solution is evaluated and assigned a reward based on its correctness or quality.
+- Baseline Calculation: The average reward of the group serves as a baseline.
+- Policy Update: The model updates its parameters by comparing each solution's reward to the group baseline, reinforcing better-than-average solutions and discouraging worse-than-average ones.
+
+This approach reduces computational overhead by avoiding the training of a separate value estimation model, making the learning process more efficient. For more details, refer to the original paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://arxiv.org/pdf/2402.03300)
+
+## Key Components
+
+- No Value Function (Critic-less): unlike PPO, GRPO does not train a separate value network (critic)
+- Group Sampling (Grouped Rollouts): instead of evaluating one rollout per input, GRPO generates multiple completions (responses) from the current policy for each prompt. This set of completions is referred to as a group.
+- Relative Rewards: within each group, completions are scored (e.g., based on correctness), and rewards are normalized relative to the group, as sketched below.
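+
+A minimal sketch of this group-relative normalization (illustrative, not verl's exact
+implementation):
+
+```python
+import torch
+
+def grpo_advantages(rewards, group_ids, eps=1e-6):
+    """rewards: (B,) one scalar reward per sampled response.
+    group_ids: (B,) responses sharing an id were sampled from the same prompt."""
+    advantages = torch.empty_like(rewards)
+    for g in group_ids.unique():
+        m = group_ids == g
+        r = rewards[m]
+        # score relative to the group: subtract the group mean, divide by the std
+        advantages[m] = (r - r.mean()) / (r.std() + eps)
+    return advantages
+```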
+
+## Configuration
+
+Note that all configs containing `micro_batch_size` are used to configure the maximum sample or token count per forward or backward pass to avoid GPU OOMs, whose value should not change algorithmic/convergence behavior.
+
+Despite that many configurations start with the `ppo_` prefix, they work across different RL algorithms in verl, as the GRPO training loop is similar to that of PPO (without critic).
+
+
+
+- `actor_rollout_ref.rollout.n`: For each prompt, sample n times. Defaults to 1. For GRPO, please set it to a value larger than 1 for group sampling.
+
+- `data.train_batch_size`: The global batch size of prompts used to generate a set of sampled trajectories/rollouts. The number of responses/trajectories is `data.train_batch_size * actor_rollout_ref.rollout.n`
+
+- `actor_rollout_ref.actor.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO actor updates. The ppo_mini_batch_size is a global size across all workers.
+
+- `actor_rollout_ref.actor.ppo_epochs`: Number of epochs for GRPO updates on one set of sampled trajectories for actor
+
+- `actor_rollout_ref.actor.clip_ratio`: The GRPO clip range. Default to 0.2
+
+- `algorithm.adv_estimator`: Default is gae. Please set it to grpo instead
+
+- `actor_rollout_ref.actor.loss_agg_mode`: Default is "token-mean". Options include "token-mean", "seq-mean-token-sum", "seq-mean-token-mean". The original GRPO paper takes the sample-level loss (seq-mean-token-mean), which may be unstable in long-CoT scenarios. All GRPO example scripts provided in verl use the default configuration "token-mean" for loss aggregation instead.
+
+Instead of adding KL penalty in the reward, GRPO regularizes by directly adding the KL divergence between the trained policy and the reference policy to the loss:
+
+- `actor_rollout_ref.actor.use_kl_loss`: To use kl loss in the actor. When used, we are not applying KL in the reward function. Default is False. Please set it to True for GRPO.
+
+- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss. Default is 0.001.
+
+- `actor_rollout_ref.actor.kl_loss_type`: Support kl(k1), abs, mse(k2), low_var_kl(k3) and full. Appending "+" in the end (e.g., 'k1+' and 'k3+') would apply straight through to employ k2 for unbiased gradient estimation, regardless of the kl value estimation (see https://github.com/volcengine/verl/pull/2953#issuecomment-3162113848 for more details). How to calculate the kl divergence between actor and reference policy. See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html
+
+## Advanced Extensions
+
+### DrGRPO
+
+[Understanding R1-Zero-Like Training: A Critical Perspective](https://arxiv.org/pdf/2503.20783) claims there's optimization bias in GRPO, which leads to artificially longer responses, especially for incorrect outputs. This inefficiency stems from the way GRPO calculates advantages using group-based reward normalization. Instead, DrGRPO aggregates token-level losses by normalizing with a global constant to eliminate length bias.
+
+Configure the following to enable DrGRPO, with all other parameters the same as GRPO's:
+
+- `actor_rollout_ref.actor.loss_agg_mode`: "seq-mean-token-sum-norm", which turns off seq-dim averaging (see the sketch after this list)
+- `actor_rollout_ref.actor.loss_scale_factor`: (Optional) Set to a constant integer (e.g., max response length) to ensure consistent normalization throughout training. If not set, uses the current batch's response length.
+- `actor_rollout_ref.actor.use_kl_loss`: Please set it to False for DrGRPO
+- `algorithm.norm_adv_by_std_in_grpo`: False, which turns off standard deviation norm
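+
+A sketch of what `"seq-mean-token-sum-norm"` does (illustrative; see
+`verl/trainer/ppo/core_algos.py` for the exact implementation and batching details):
+
+```python
+import torch
+
+def dr_grpo_loss_agg(loss_mat, loss_mask, loss_scale_factor):
+    # token-sum per sequence, then a batch mean normalized by a *global constant*
+    # (e.g. the max response length) instead of each sequence's own token count,
+    # which removes the length bias of sample-level averaging
+    seq_losses = torch.sum(loss_mat * loss_mask, dim=-1)
+    return torch.mean(seq_losses) / loss_scale_factor
+```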
+
+## Reference Example
+
+Qwen2.5 GRPO training log and commands: [link](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b-fsdp2.log)
+
+```bash
+bash examples/grpo_trainer/run_qwen3-8b.sh
+```
+
+For more reference performance, please see https://verl.readthedocs.io/en/latest/algo/baseline.html
diff --git a/code/RL_model/verl/verl_train/docs/algo/opo.md b/code/RL_model/verl/verl_train/docs/algo/opo.md
new file mode 100644
index 0000000000000000000000000000000000000000..338f3a762d9585c608af28cdf4e75837dbfe11e4
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/algo/opo.md
@@ -0,0 +1,33 @@
+# On-Policy RL with Optimal Reward Baseline (OPO)
+
+Last updated: 06/02/2025.
+
+Loose on-policy constraints and suboptimal baselines in reinforcement learning often lead to training instability such as large policy shifts and entropy collapse. OPO addresses these challenges by using exact on-policy training with the theoretically optimal reward baseline for advantage estimation. It achieves lower policy shifts and higher output entropy, encouraging more diverse and less repetitive responses.
+
+OPO uses group sampling to generate multiple outputs for each input, like GRPO. Unlike group-based algorithms, which typically use the mean reward of a group as the baseline, OPO employs a theoretically optimal baseline: the length-weighted reward of the group. It also omits the standard deviation normalization. By adopting these two key components, OPO enables the training of a single policy model with the objective of maximizing only the expected reward. For more details, refer to the original paper [On-Policy RL with Optimal Reward Baseline](https://arxiv.org/pdf/2505.23585).
+
+## Key Components
+
+- Exact On-Policy Training: always generates responses from the current policy, without using any pre-generated data or off-policy data.
+- Optimal Reward Baseline: uses a length-weighted reward of the group as the baseline for normalizing the rewards (see the sketch below).
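+
+A minimal sketch of the length-weighted baseline for one group (illustrative; shapes
+and names are assumptions, not the verl implementation):
+
+```python
+import torch
+
+def opo_advantages(rewards, lengths):
+    """rewards: (N,) scalar reward per response in one group.
+    lengths: (N,) response length of each sampled output."""
+    # optimal baseline: length-weighted mean reward of the group
+    baseline = (lengths * rewards).sum() / lengths.sum()
+    # no standard-deviation normalization, unlike GRPO
+    return rewards - baseline
+```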
+
+## Configuration
+
+To configure OPO within the framework, use the following YAML settings. These parameters are crucial for enabling exact on-policy training and activating the optimal reward baseline.
+
+```yaml
+algorithm:
+ adv_estimator: opo # Use OPO for optimal reward baseline
+data:
+ train_batch_size: 1024
+actor_rollout_ref:
+ actor:
+    ppo_mini_batch_size: 1024 # ppo_mini_batch_size should equal train_batch_size to enable exact on-policy training
+ entropy_coeff: 0 # disable entropy regularization
+ use_kl_loss: False # disable kl regularization
+ kl_loss_coef: 0
+```
+
+## Advanced Extensions
+
+OPO can also be extended to other algorithms like RLOO and Reinforce++: one only needs to adjust their configurations to enable exact on-policy training and incorporate the optimal length-weighted reward baseline, with minimal modifications to their advantage estimation functions.
diff --git a/code/RL_model/verl/verl_train/docs/algo/otb.md b/code/RL_model/verl/verl_train/docs/algo/otb.md
new file mode 100644
index 0000000000000000000000000000000000000000..288eb71bd69cbe38a56b81e1d59b118be4a07a6d
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/algo/otb.md
@@ -0,0 +1,104 @@
+# Optimal Token Baseline (OTB)
+
+Last updated: 12/25/2025.
+
+Optimal Token Baseline (OTB) is a dynamic token-level baseline for variance reduction. It weights updates based on "Realized Energy"—essentially, how much uncertainty has accumulated up to that specific token. It downweights the noisy parts and trusts the clear signals. Read the [Optimal Token Baseline blog](https://richardli.xyz/optimal-token-baseline) for more details.
+
+## The method: OTB
+
+- OTB builds a _dynamic_ baseline that adapts to each token by tracking the “Realized Energy”—the uncertainty that has accumulated up to that token. It downweights the noisy parts and trusts the clear signals.
+- Unlike standard group means (which average over the padding `EOS` token ineffectively), OTB handles this naturally by computing baselines only over valid tokens.
+
+## Logit-Gradient Proxy
+
+- Computing true uncertainty requires expensive backward passes (calculating gradient norms per token). Instead, OTB introduces the **Logit-Gradient Proxy**: the realized energy can be estimated entirely from forward probabilities.
+- This means zero extra backward calls and effectively no additional runtime overhead.
+
+## Mechanics at a glance
+
+For each prompt group of size `N`, OTB computes rewards-to-go `G_t` and cumulative variance weights `W_t`. The optimal baseline per token is
+
+```
+B*_t = (Σ_i G_t^{(i)} · W_t^{(i)}) / (Σ_i W_t^{(i)} + ε),
+W_t = Σ_{j=1}^t (1 - 2π_j + Σπ_j²),
+Σπ_j² = exp(logsumexp(2·logits_j) - 2·logsumexp(logits_j)).
+```
+
+The final advantage is `(G_t - B*_t) · mask_t`, so padding tokens stay at zero.
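+
+A sketch of these formulas in PyTorch (shapes and names are illustrative; the actual
+logic lives in `compute_optimal_token_baseline_advantage`):
+
+```python
+import torch
+
+def otb_advantages(G, logits, token_ids, mask, eps=1e-8):
+    """G: (N, T) rewards-to-go; logits: (N, T, V); token_ids: (N, T) sampled ids;
+    mask: (N, T) 1 for valid tokens. All N rollouts belong to one prompt group."""
+    # π_j: probability of the sampled token at each position
+    pi = torch.log_softmax(logits, dim=-1).gather(-1, token_ids.unsqueeze(-1)).squeeze(-1).exp()
+    # Σπ_j² computed stably from the logits via the logsumexp identity
+    sum_pi_sq = torch.exp(torch.logsumexp(2 * logits, -1) - 2 * torch.logsumexp(logits, -1))
+    # realized energy W_t: cumulative uncertainty over valid tokens only
+    W = torch.cumsum((1.0 - 2.0 * pi + sum_pi_sq) * mask, dim=-1)
+    # optimal per-token baseline, aggregated over the N rollouts of the group
+    baseline = (G * W * mask).sum(0) / ((W * mask).sum(0) + eps)  # (T,)
+    return (G - baseline) * mask
+```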
+
+## Integration in VERL
+
+- `AdvantageEstimator.OPTIMAL_TOKEN_BASELINE` registers `compute_optimal_token_baseline_advantage`, invoked whenever `algorithm.adv_estimator` is set to `optimal_token_baseline`.
+- `ActorRolloutRefWorker.compute_log_prob` emits an additional tensor `sum_pi_squared` (Σπ² per token) when `actor.calculate_sum_pi_squared=True`. This requires disabling fused log-prob kernels, because they do not surface logits.
+- Trainers assert `sum_pi_squared` exists, regroup trajectories by `non_tensor_batch["uid"]`, and run the OTB calculation. If rollout IS is active, they rescale the weights by `rollout_is_weights**2` before aggregating.
+- In Ulysses sequence-parallel setups, the actor gathers, unpads, and returns Σπ² in the same way it handles log-probabilities, so OTB supports sharded sequence-parallel models out of the box.
+- `sum_pi_squared_checkpointing` is available to trade compute for memory when Σπ² tensors become large (e.g., lengthy chain-of-thought reasoning).
+
+## Configuration checklist
+
+- `actor_rollout_ref.actor.calculate_sum_pi_squared: true` (mandatory).
+- `actor_rollout_ref.model.use_fused_kernels: false` (required until fused kernels emit logits).
+- `algorithm.adv_estimator: optimal_token_baseline`.
+- Group sampling (`actor_rollout_ref.rollout.n > 1`) to unlock OTB’s variance reduction; with `n=1` the baseline collapses to returns.
+
+Example OmegaConf overlay:
+
+```yaml
+algorithm:
+ adv_estimator: optimal_token_baseline
+
+actor_rollout_ref:
+ actor:
+ calculate_sum_pi_squared: true
+ sum_pi_squared_checkpointing: false # optional memory saver
+ rollout:
+ n: 8
+```
+
+## Example script
+
+- `examples/otb_trainer/run_qwen2_5-7b.sh`.
+
+## Gradient Variance Proxy Metrics
+
+All gradient-variance analysis in the Optimal Token Baseline work starts from the variance identity
+
+```
+Var(ĝ) = E[||ĝ||²] - ||E[ĝ]||²,
+```
+
+which states that the variance of any stochastic gradient equals the mean squared magnitude minus the squared norm of its expectation.
+
+For a trajectory `τ`, the policy-gradient estimator is
+
+```
+ĝ(τ) = ∇ log π_θ(τ) · A(τ), A(τ) = R(τ) - B.
+```
+
+The logit-gradient proxy approximates the squared gradient norm without an extra backward pass:
+
+```
+||ĝ(τ)||² ≈ Ŵ(τ) · A(τ)²,
+```
+
+where `Ŵ(τ)` is the realized energy defined above. Given a mini-batch `{τ_i}` of size `N`, we decompose its statistics into three diagnostics:
+
+- **Signal strength (squared norm of the mean gradient)**
+ ```
+ S = || (1/N) · Σ ĝ(τ_i) ||²
+ ```
+- **Total power (signal + noise)**
+ ```
+ P_total = (1/N) · Σ Ŵ(τ_i) · A(τ_i)²
+ ```
+- **Pure noise (estimated variance of the batch mean)**
+ ```
+ Var_proxy = (1/(N-1)) · (P_total - S)
+ ```
+
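+A direct transcription of the three formulas (a sketch: `S` is taken as an input here
+rather than derived from gradients, and the names are illustrative rather than the
+actual `compute_variance_proxy_metrics` signature):
+
+```python
+import torch
+
+def variance_proxy_metrics(W_hat, A, signal_strength):
+    """W_hat: (N,) realized energy per trajectory; A: (N,) advantages;
+    signal_strength: scalar estimate of S = ||mean gradient||^2."""
+    N = W_hat.numel()
+    total_power = (W_hat * A.pow(2)).mean()                  # P_total
+    pure_noise = (total_power - signal_strength) / (N - 1)   # Var_proxy
+    return {
+        "variance_proxy/proxy1_signal_strength": signal_strength,
+        "variance_proxy/proxy2_total_power": total_power,
+        "variance_proxy/proxy3_pure_noise": pure_noise,
+    }
+```
+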
+`verl/trainer/ppo/metric_utils.py#L306` implements these diagnostics via `compute_variance_proxy_metrics`, emitting
+`variance_proxy/proxy1_signal_strength`,
+`variance_proxy/proxy2_total_power`, and
+`variance_proxy/proxy3_pure_noise`.
+
+Tracking these metrics provides a forward-only, low-overhead view of gradient health for any advantage estimator that supplies `sum_pi_squared`.
diff --git a/code/RL_model/verl/verl_train/docs/algo/ppo.md b/code/RL_model/verl/verl_train/docs/algo/ppo.md
new file mode 100644
index 0000000000000000000000000000000000000000..4740667218579bacf8ab7d1fa5723962c720304c
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/algo/ppo.md
@@ -0,0 +1,105 @@
+# Proximal Policy Optimization (PPO)
+
+Last updated: 06/19/2025.
+
+Proximal Policy Optimization (PPO) is a family of policy gradient methods for reinforcement learning, proposed by OpenAI in 2017. PPO strikes a balance between simplicity, stability, and performance, making it one of the most widely used algorithms in modern RL applications, including large-scale language model fine-tuning.
+
+Traditional policy gradient methods like REINFORCE or Vanilla Policy Gradient suffer from:
+
+- High variance and sample inefficiency.
+- Instability due to large policy updates.
+
+PPO addresses these problems using a clipped surrogate objective that avoids overly large updates without requiring second-order derivatives.
+
+For more technical details regarding PPO, we suggest reading the introduction in the [OpenAI spinning up tutorial](https://spinningup.openai.com/en/latest/algorithms/ppo.html), and the paper [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347).
+
+## Key Components
+
+- Actor-Critic Architecture: PPO requires both an actor model (policy) and a critic model (value function). This differs from other algorithms like GRPO and RLOO that don't require a critic model.
+
+- Generalized Advantage Estimation (GAE): PPO uses GAE for computing advantage values, which helps reduce variance in policy gradient estimates while maintaining low bias.
+
+- Clipped Surrogate Objective: The core of PPO is implemented through the clipped surrogate objective function that limits policy updates; a sketch follows below.
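+
+A minimal sketch of that objective in loss form (illustrative; it mirrors the shape of
+the clipped-loss code used elsewhere in these docs):
+
+```python
+import torch
+
+def ppo_clip_loss(log_prob, old_log_prob, advantages, clip_ratio=0.2):
+    ratio = torch.exp(log_prob - old_log_prob)
+    pg_losses1 = -advantages * ratio
+    pg_losses2 = -advantages * torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio)
+    # take the pessimistic (larger) loss, so overly large policy updates are clipped
+    return torch.maximum(pg_losses1, pg_losses2).mean()
+```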
+
+## Configuration
+
+Note that all configs containing `micro_batch_size` are used to configure the maximum sample or token count per forward or backward pass to avoid GPU OOMs, whose value should not change algorithmic/convergence behavior.
+
+Most critic configs are similar to those of actors. Note that the critic model is omitted from the figure below.
+
+
+
+- `data.train_batch_size`: The global batch size of prompts used to generate a set of sampled trajectories/rollouts. The number of responses/trajectories is `data.train_batch_size * actor_rollout_ref.rollout.n`
+
+- `actor_rollout_ref.actor.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO actor updates. The ppo_mini_batch_size is a global size across all workers
+
+- `critic.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO critic updates. The ppo_mini_batch_size is a global size across all workers
+
+- `actor_rollout_ref.actor.clip_ratio`: The PPO clip range. Default to 0.2
+
+- `actor_rollout_ref.actor.ppo_epochs`: Number of epochs for PPO updates on one set of sampled trajectories for actor
+
+- `critic.ppo_epochs`: Number of epochs for PPO updates on one set of sampled trajectories for critic. Defaults to `actor_rollout_ref.actor.ppo_epochs`
+
+- `algorithm.gamma`: discount factor
+
+- `algorithm.lam`: The lambda term that trades off between bias and variance in the GAE estimator
+
+- `algorithm.adv_estimator`: Support gae, grpo, reinforce_plus_plus, reinforce_plus_plus_baseline, rloo. A sketch of GAE follows below.
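+
+For reference, a minimal sketch of GAE as parameterized by `algorithm.gamma` and
+`algorithm.lam` (illustrative, single-trajectory form):
+
+```python
+def compute_gae(rewards, values, gamma=1.0, lam=1.0):
+    """rewards, values: per-step rewards and value estimates for one trajectory."""
+    advantages, running = [0.0] * len(rewards), 0.0
+    for t in reversed(range(len(rewards))):
+        next_value = values[t + 1] if t + 1 < len(values) else 0.0
+        delta = rewards[t] + gamma * next_value - values[t]  # TD residual
+        running = delta + gamma * lam * running              # lam trades bias vs. variance
+        advantages[t] = running
+    return advantages
+```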
+
+## Advanced Extensions
+
+### KL Divergence Control
+
+Options to prevent the policy from diverging too far from a reference policy. Two mechanisms are available: KL reward penalty and KL loss. For more technical details, see [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
+
+Options to use KL loss for KL divergence control:
+
+- `actor_rollout_ref.actor.use_kl_loss`: Whether to use a KL loss in the actor. When enabled, KL is not applied in the reward function. Default is False.
+
+- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss. Default is 0.001.
+
+- `actor_rollout_ref.actor.kl_loss_type`: How to estimate the KL divergence between the actor and the reference policy. Supported values: kl(k1), abs, mse(k2), low_var_kl(k3) and full. Appending "+" (e.g., 'k1+', 'k3+') applies a straight-through trick that uses k2 for unbiased gradient estimation regardless of the KL value estimator (see https://github.com/volcengine/verl/pull/2953#issuecomment-3162113848 for more details). See this blog post for a detailed analysis: http://joschu.net/blog/kl-approx.html
+
+Options to use KL penalty in the reward:
+
+- `algorithm.use_kl_in_reward`: Whether to enable in-reward kl penalty. Default is False.
+
+- `algorithm.kl_penalty`: Support kl(k1), abs, mse(k2), low_var_kl(k3) and full. This defines the way to calculate the kl divergence between actor and reference policy. For specific options, refer to `kl_penalty` in core_algos.py. See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html
+
+- `algorithm.kl_ctrl.kl_coef`: The (initial) coefficient of in-reward kl_penalty. Default is 0.001.
+- `algorithm.kl_ctrl.type`: 'fixed' for FixedKLController and 'adaptive' for AdaptiveKLController.
+- `algorithm.kl_ctrl.horizon`: See source code of AdaptiveKLController for details.
+- `algorithm.kl_ctrl.target_kl`: See source code of AdaptiveKLController for details.
+
+### Dual-clip PPO
+
+Dual-Clip PPO extends the clipped objective by applying an additional lower bound when the advantage is negative, so that a negative advantage multiplied by a large ratio cannot drive the objective below a specified bound. A minimal sketch follows the parameter below.
+
+
+
+- `actor_rollout_ref.actor.clip_ratio_c`: The constant that defines the lower bound for Dual-clip PPO. Defaults to 3.0.
+
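+A minimal sketch of how the dual-clip lower bound extends the standard clipped objective (illustrative only; names are hypothetical):
+
+```python
+import torch
+
+def dual_clip_ppo_loss(log_prob, old_log_prob, advantages,
+                       clip_ratio=0.2, clip_ratio_c=3.0):
+    """Clipped surrogate with a dual-clip bound for negative advantages."""
+    ratio = torch.exp(log_prob - old_log_prob)
+    clipped = torch.clamp(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio)
+    standard = torch.min(ratio * advantages, clipped * advantages)
+    # when A < 0, bound the objective from below by clip_ratio_c * A
+    dual = torch.max(standard, clip_ratio_c * advantages)
+    objective = torch.where(advantages < 0, dual, standard)
+    return -torch.mean(objective)
+```
+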
+## Reference Example
+
+Qwen2.5 training log and commands: [link](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log)
+
+```bash
+bash run_gemma.sh \
+ trainer.n_gpus_per_node=1 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ trainer.logger=console \
+ critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \
+ actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
+ data.train_batch_size=256 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=64 \
+ actor_rollout_ref.actor.ppo_micro_batch_size=2 \
+ critic.ppo_micro_batch_size=2
+```
+
+Reference performance with verl v0.2:
+
+| Model | Method | Score | Link |
+|-------------------------------|------------------|-------|------------------------------------------------------------------------------------------------|
+| Qwen/Qwen2.5-0.5B-Instruct | pretrained model | 36.4 | [Qwen Blog](https://qwenlm.github.io/blog/qwen2.5-llm/) |
+| Qwen/Qwen2.5-0.5B-Instruct | PPO | 56.7 | [PPO Command and Logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log) |
diff --git a/code/RL_model/verl/verl_train/docs/algo/rollout_corr.md b/code/RL_model/verl/verl_train/docs/algo/rollout_corr.md
new file mode 100644
index 0000000000000000000000000000000000000000..8569b243a9e2bedd33d02e8f53f39e09d046011a
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/algo/rollout_corr.md
@@ -0,0 +1,1313 @@
+# Rollout Correction
+
+**Author:** [Yingru Li](https://richardli.xyz/)
+
+Last updated: 10/30/2025.
+
+---
+
+> **📖 Documentation Structure**
+>
+> - **This document** - Practical usage guide: configurations, presets, troubleshooting
+> - **[Mathematical Formulations](rollout_corr_math.md)** - Theoretical foundations, derivations, and algorithmic details
+>
+> Start here for implementation, refer to the math doc for theory and design rationale.
+
+---
+
+This document provides a comprehensive overview of the Rollout Correction implementation in verl.
+
+**Note on Naming**: This feature is called "Rollout Correction" to reflect the complete functionality: importance sampling (IS) weights and rejection sampling (RS). The internal variable `rollout_is_weights` retains its name as it specifically refers to the IS weights component.
+
+### BibTeX Citation
+
+```bibtex
+@online{liu-li-2025-rl-collapse,
+ title = {When Speed Kills Stability: Demystifying {RL} Collapse from the Training-Inference Mismatch},
+ author = {Liu, Jiacai and Li, Yingru and Fu, Yuqian and Wang, Jiawei and Liu, Qian and Shen, Yu},
+ year = {2025},
+ month = sep,
+ url = {https://richardli.xyz/rl-collapse}
+}
+```
+
+### Blog Series
+
+- Main blog post: https://richardli.xyz/rl-collapse
+- [Part 1: Why Mismatch Breaks LLM-RL](https://richardli.xyz/rl-collapse-1) (analytical framework using TV distance for bias and χ²-divergence for variance)
+- [Part 2: The Gradient Estimator Trials](https://richardli.xyz/rl-collapse-2) (token-level vs sequence-level correction bias-variance tradeoff)
+- [Part 3: When Math Meets Reality—Toxic Tails and Length Traps](https://richardli.xyz/rl-collapse-3) (why rejection over clipping, and geometric-level RS)
+
+## Overview
+
+Rollout Correction provides a unified framework to handle **general off-policy problems** in RL training. Any scenario where the data collection distribution differs from the training distribution can benefit from these methods.
+
+**Common off-policy scenarios:**
+
+1. **Policy Mismatch** (Implementation Differences)
+
+ - Different precision: FP8 vs FP16 vs BF16 vs FP32
+ - Different backends: vLLM vs SGLang vs FSDP vs Megatron
+ - Different implementations even with identical weights
+
+2. **Temporal Lag** (Model Staleness)
+
+ - Rollout uses older checkpoint while training has progressed
+ - Asynchronous rollout workers with stale parameters
+ - Common in distributed/async RL systems
+
+3. **Replay Buffers**
+
+ - Training on historical trajectories from earlier iterations
+ - Experience replay from different policy versions
+ - Data augmentation or resampling strategies
+
+4. **Off-Policy Algorithms**
+
+ - Behavioral cloning from expert demonstrations
+ - DAPO (data from auxiliary policies)
+ - Any algorithm using trajectories from a different policy
+
+5. **Data Quality Filtering**
+ - Reweighting or filtering collected data
+ - Preference learning with modified distributions
+ - Curriculum learning with distribution shifts
+
+These off-policy gaps can cause training instability and policy collapse. Rollout Correction uses importance sampling (IS) weights and rejection sampling (RS) to correct for any distribution shift between data collection and training.
+
+**Important Note on Common Implementation Mistakes:**
+
+Many LLM-RL implementations incorrectly apply PPO by **ignoring the actual rollout policy** π_rollout and assuming the training reference policy π_old is the behavior policy. This is mathematically incorrect when π_rollout ≠ π_old (which is typical in LLM-RL due to precision/backend differences between rollout and training).
+
+**This is not PPO's fault** - PPO itself is mathematically correct. The issue is the incorrect assumption that π_old = π_rollout in naive implementations.
+
+This critical implementation mistake that leads to RL training collapse was identified in the blog post ["When Speed Kills Stability: Demystifying RL Collapse from the Training-Inference Mismatch"](https://richardli.xyz/rl-collapse) and motivated the development of this rollout correction framework.
+
+**Mathematically correct approaches:**
+
+- **Decoupled mode**: Three policies (π_rollout, π_old, π_θ) with IS correction from π_rollout to π_old
+- **Bypass mode**: Two policies (π_rollout = π_old, π_θ) using the actual rollout policy as the PPO anchor
+- **Bypass + Policy Gradient mode**: Two policies (π_rollout, π_θ) with IS/RS correction and no PPO clipping
+
+See [Mathematical Formulations](rollout_corr_math.md#38-common-implementation-mistake) for detailed explanation.
+
+### Key Design Principle: Separation of IS Weights and Rejection Sampling
+
+The implementation cleanly separates two orthogonal mechanisms:
+
+1. **IS Weights** (`rollout_is_weights`): Continuous reweighting for gradient correction
+
+   - Policy ratio: π_old/π_rollout (decoupled) or π_θ/π_rollout (bypass)
+ - **Safety-bounded**: Clamped to [exp(-20), exp(20)] ≈ [2e-9, 5e8] to prevent overflow
+ - Token level: Bounds per-token ratios
+ - Sequence level: Bounds product of ratios (broadcast to all tokens)
+ - **Truncated**: Upper clamped via `.clamp(max=rollout_is_threshold)` (TIS: Truncated Importance Sampling)
+ - **Zeroed at padding**: Multiplied by response_mask to zero out padding positions
+ - Used to weight policy gradients (variance reduction)
+
+2. **Rejection Sampling** (`modified_response_mask`): Binary filtering for outlier exclusion
+ - Creates binary mask: 1 = keep, 0 = reject
+ - Rejects tokens/sequences with IS ratios outside [lower_threshold, upper_threshold]
+ - Modifies response_mask to exclude rejected samples from training
+ - Used for loss aggregation (rejected samples don't contribute to gradients)
+
+This separation ensures:
+
+- ✅ IS weights provide continuous reweighting (reduce variance)
+- ✅ Rejection sampling provides hard filtering (remove extreme outliers)
+- ✅ Both mechanisms can be enabled independently or together
+- ✅ Safety bounds prevent numerical overflow in all cases
+
+## Quick Start: Using Verified Presets
+
+**NEW**: We now provide typed configuration with verified presets for common scenarios. These presets have been validated with tens of thousands of GPU hours across various models and training scenarios.
+
+### Python API
+
+```python
+from verl.trainer.config.algorithm import RolloutCorrectionConfig
+
+# === Decoupled PPO mode (3 policies: π_rollout, π_old, π_θ) ===
+# IS weights correct for gap between π_old and π_rollout
+config = RolloutCorrectionConfig.decoupled_token_is() # Token-TIS
+config = RolloutCorrectionConfig.decoupled_seq_is() # Seq-TIS
+config = RolloutCorrectionConfig.decoupled_seq_is_rs() # Seq-MIS
+config = RolloutCorrectionConfig.decoupled_geo_rs() # Geo-RS (ratio mode)
+config = RolloutCorrectionConfig.decoupled_geo_rs_token_tis() # Geo-RS + Token-TIS
+
+# === K3 KL Estimator presets (more stable for small KL) ===
+config = RolloutCorrectionConfig.decoupled_k3_rs() # K3-RS only
+config = RolloutCorrectionConfig.decoupled_k3_rs_token_tis() # K3-RS + Token-TIS
+
+# === Bypass PPO mode (2 policies: π_rollout = π_old, π_θ) - fast ===
+# PPO ratio handles IS, so no explicit IS weights needed
+config = RolloutCorrectionConfig.bypass_ppo_clip() # PPO-clip only
+config = RolloutCorrectionConfig.bypass_ppo_clip_geo_rs() # PPO-clip + Geo-RS (ratio)
+config = RolloutCorrectionConfig.bypass_ppo_clip_k3_rs() # PPO-clip + K3-RS
+
+# === Bypass PG mode (2 policies, no PPO clipping) - fast ===
+# IS weights computed on-the-fly as π_θ / π_rollout
+config = RolloutCorrectionConfig.bypass_pg_is() # Seq-TIS + PG
+config = RolloutCorrectionConfig.bypass_pg_geo_rs() # Geo-RS + PG (ratio)
+config = RolloutCorrectionConfig.bypass_pg_geo_rs_token_tis() # Geo-RS + Token-TIS + PG
+
+# === Other ===
+config = RolloutCorrectionConfig.disabled() # Metrics only (no correction)
+```
+
+### YAML Configuration (Advanced)
+
+For advanced customization or YAML-based configs:
+
+```yaml
+algorithm:
+ rollout_correction:
+ rollout_is: token # IS weights: "token", "sequence", or null
+ rollout_is_threshold: 2.0 # Upper threshold for IS weights
+ rollout_is_batch_normalize: false # Batch normalize IS weights to mean=1.0
+ rollout_rs: null # Rejection sampling: comma-separated canonical options (e.g. "token_k1,seq_max_k2")
+ rollout_rs_threshold: null # Threshold spec: float(s) or "lower_upper" string(s)
+ bypass_mode: false # Skip old_log_prob computation (sets π_old = π_rollout)
+ loss_type: ppo_clip # Loss type in bypass mode: "ppo_clip" (default) or "reinforce"
+
+# REQUIRED: Enable log prob calculation
+actor_rollout_ref:
+ rollout:
+ calculate_log_probs: true
+```
+
+## Files
+
+### **Core Implementation**
+
+- `verl/trainer/ppo/rollout_corr_helper.py` - Contains `compute_rollout_correction_and_rejection_mask()` and `compute_offpolicy_metrics()`
+- `verl/trainer/ppo/core_algos.py` - Rollout Correction integration with PPO and REINFORCE modes (`compute_policy_loss_bypass_mode()`, `compute_policy_loss_reinforce()`)
+- `verl/trainer/ppo/ray_trainer.py` - Bypass mode implementation (skips `old_log_prob` computation)
+- `verl/workers/actor/dp_actor.py` - Mode selection logic and metrics collection
+
+### **Configuration Files**
+
+- `verl/trainer/config/algorithm.py` - Rollout Correction parameters in `AlgoConfig`
+- `verl/workers/config/actor.py` - Rollout Correction parameters in `ActorConfig`
+- `verl/trainer/config/actor/actor.yaml` - Rollout Correction configuration section
+- `verl/trainer/config/ppo_trainer.yaml` - Algorithm config with Rollout Correction
+
+### **Documentation**
+
+- `docs/examples/config.rst` - Configuration parameter descriptions
+
+### **Example Scripts**
+
+- `recipe/dapo/run_dapo_qwen2.5_32b_rollout_corr.sh` - DAPO example with Rollout Correction
+- `examples/rollout_correction/run_with_rollout_corr.sh` - Basic example
+- `examples/rollout_correction/run_with_rollout_corr_multi_rs.sh` - Multi-RS example
+
+### **Tests**
+
+- `tests/trainer/ppo/test_rollout_corr.py` - Unit tests for IS/RS mechanisms
+- `tests/trainer/ppo/test_rollout_corr_integration.py` - Integration tests
+
+## Configuration Parameters
+
+All parameters are under `algorithm.rollout_correction`:
+
+### `rollout_is` (str or null)
+
+Importance sampling weights aggregation level:
+
+- `null` = No IS weights computed (metrics-only mode)
+- `"token"`: Per-token IS weights
+ - **Decoupled mode**: ρ_t = π_old(t)/π_rollout(t)
+  - **Bypass/Pure IS mode**: ρ_t = π_θ(t)/π_rollout(t)
+ - Independent truncation per token
+ - Typical threshold: 1.5 - 5.0
+- `"sequence"`: Per-sequence weight ρ_seq = ∏_t ρ_t
+ - Multiplicative aggregation across sequence
+ - Typical threshold: 2.0 - 10.0
+
+All IS weights are safety-bounded to [exp(-20), exp(20)] ≈ [2e-9, 5e8]. A sketch of the two aggregation levels follows.
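+
+A sketch of the two aggregation levels, assuming per-token log-prob tensors of shape `[batch, seq_len]` (illustrative; the production computation lives in `rollout_corr_helper.py`):
+
+```python
+import torch
+
+def is_weights(old_log_prob, rollout_log_prob, response_mask, level="token"):
+    log_ratio = old_log_prob - rollout_log_prob  # log(pi_old / pi_rollout)
+    if level == "token":
+        # independent safety bound and weight per token
+        return torch.exp(log_ratio.clamp(-20, 20)) * response_mask
+    # sequence level: bound the summed log-ratio, broadcast to all tokens
+    seq_log = (log_ratio * response_mask).sum(dim=-1, keepdim=True)
+    return torch.exp(seq_log.clamp(-20, 20)).expand_as(log_ratio) * response_mask
+```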
+
+### `rollout_is_threshold` (float)
+
+Upper threshold for IS weight truncation. Default: `2.0`
+
+- Truncates IS weights via `.clamp(max=rollout_is_threshold)` (TIS: Truncated Importance Sampling)
+- Applied to IS weights for variance reduction
+- Separate from rejection sampling (controlled by `rollout_rs` parameters)
+
+### `rollout_is_batch_normalize` (bool)
+
+Apply batch normalization to IS weights. Default: `False`
+
+- `True`: Normalize IS weights to have mean=1.0 within each batch
+ - **Token-level IS**: Normalizes over all token weights
+ - **Sequence-level IS**: Normalizes over sequence means (one weight per sequence)
+- `False`: Use raw (truncated) IS weights
+- Reduces variance by ensuring average weight is 1.0 per batch
+- Applied AFTER truncation to preserve truncation semantics
+- Only affects IS weight values, not rejection sampling (see the sketch below)
+
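+A minimal sketch of the normalize-after-truncation order for token-level weights (illustrative only):
+
+```python
+import torch
+
+def batch_normalize(weights, response_mask, threshold=2.0):
+    truncated = weights.clamp(max=threshold)  # truncation happens first
+    valid_mean = (truncated * response_mask).sum() / response_mask.sum()
+    return truncated / valid_mean  # mean over valid tokens becomes 1.0
+```
+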
+### `rollout_rs` (str or null)
+
+Rejection sampling aggregation modes. Supply a comma-separated string (spaces optional) using the canonical options implemented in `rollout_corr_helper`:
+
+- `token_k1`: Token-level rejection with `-log r` bounds (ratio thresholds supplied as `lower_upper`). Example: `"0.6_1.4"`
+- `token_k2`: Token-level rejection with `0.5 * (log r)^2` (upper bound only)
+- `token_k3`: Token-level rejection with `exp(log r) - 1 - log r` (upper bound only)
+- `seq_sum_k1`: Sequence-level rejection with sum of `-log r` (ratio bounds)
+- `seq_sum_k2`: Sequence-level rejection with sum of `0.5 * (log r)^2` (upper bound only)
+- `seq_sum_k3`: Sequence-level rejection with sum of `exp(log r) - 1 - log r` (upper bound only)
+- `seq_mean_k1`: Sequence-level rejection with mean of `-log r` (ratio bounds)
+- `seq_mean_k2`: Sequence-level rejection with mean of `0.5 * (log r)^2` (upper bound only)
+- `seq_mean_k3`: Sequence-level rejection with mean of `exp(log r) - 1 - log r` (upper bound only)
+- `seq_max_k2`: Sequence-level rejection with max of `0.5 * (log r)^2` (upper bound only)
+- `seq_max_k3`: Sequence-level rejection with max of `exp(log r) - 1 - log r` (upper bound only)
+
+### `rollout_rs_threshold` (str, float, or null)
+
+Threshold specification for rejection sampling. A combined example follows the list below.
+
+- Provide **one entry per option**, separated by commas. A single entry is broadcast to every option.
+- **Ratio modes (`*k1`)**: Use `"lower_upper"` strings (e.g. `"0.7_1.3"`). Supplying a float implies only the upper bound; the lower bound defaults to its reciprocal.
+- **Divergence modes (`*k2`/`*k3`)**: Supply positive upper bounds (float or numeric string).
+- Set to `null` to disable thresholds entirely (only valid when `rollout_rs` is null).
+
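+For example, two RS options can be paired one-to-one with two threshold entries (a sketch using the typed config from the Quick Start; the threshold values are illustrative):
+
+```python
+from verl.trainer.config.algorithm import RolloutCorrectionConfig
+
+# token_k1 takes a "lower_upper" ratio bound; seq_max_k2 takes an upper bound only
+config = RolloutCorrectionConfig(
+    rollout_rs="token_k1,seq_max_k2",
+    rollout_rs_threshold="0.7_1.3,2.5",
+)
+```
+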
+## Understanding the Framework: Components and Combinations
+
+The rollout correction framework is built from **orthogonal components** that can be combined flexibly. Understanding these components helps you choose the right configuration for your scenario.
+
+### Key Components
+
+1. **Operating Mode** (Section: [Operation Modes](#operation-modes))
+
+   - **Decoupled**: Three policies (π_rollout, π_old, π_θ) with separate π_old computation
+   - **Bypass**: Two policies (π_rollout = π_old, π_θ), skips π_old computation
+
+2. **Loss Function** (in bypass mode, controlled by `loss_type`)
+
+ - **PPO-clip** (`loss_type="ppo_clip"`, default): PPO clipped objective (IS handled by ratio)
+ - **REINFORCE** (`loss_type="reinforce"`): Policy gradient with explicit IS weights (no clipping)
+
+3. **IS/RS Aggregation Level**
+ - **Token**: Per-token IS weights/rejection
+ - **Sequence**: Sequence-level IS weights/rejection
+
+See [Mathematical Formulations](rollout_corr_math.md#3-algorithmic-components-and-combinations) for detailed theory.
+
+---
+
+## Preset Configuration Guide
+
+This section provides detailed guidance on choosing and using the verified presets. Each preset is a specific combination of components optimized for common scenarios.
+
+### Understanding the Presets
+
+#### Available Preset Methods
+
+| Preset Method | Estimator | Mode | IS Level | RS Level | Properties |
+| ------------------------------------------------------------------------------ | ---------------- | ------------------ | -------- | -------- | --------------------------------------- |
+| **Decoupled PPO Mode** (3 policies: π_rollout, π_old, π_θ) |
+| `decoupled_token_is()` | Token-TIS | Decoupled | token | - | Per-token IS weights |
+| `decoupled_seq_is()` | Seq-TIS | Decoupled | sequence | - | Sequence-level IS weights |
+| `decoupled_seq_is_rs()` | Seq-MIS | Decoupled | sequence | sequence | Sequence IS + sequence RS |
+| `decoupled_geo_rs()` | Geo-RS | Decoupled | - | sequence | Geometric RS (ratio mode) |
+| `decoupled_geo_rs_token_tis()` | Geo-RS-Token-TIS | Decoupled | token | sequence | Geometric filter + token clipped weight |
+| **K3 KL Estimator** (more stable for small KL values) |
+| `decoupled_k3_rs()` | K3-RS | Decoupled | - | k3 | K3 rejection, no IS weights |
+| `decoupled_k3_rs_token_tis()` | K3-RS-Token-TIS | Decoupled | token | k3 | K3 filter + token clipped weight |
+| **Bypass Mode (PPO-clip)** (2 policies; ratio handles IS, RS masks outliers) |
+| `bypass_ppo_clip()` | - | Bypass (PPO-clip) | - | - | PPO-clip only |
+| `bypass_ppo_clip_geo_rs()` | Geo-RS | Bypass (PPO-clip) | - | sequence | PPO-clip + Geo-RS (ratio) |
+| `bypass_ppo_clip_k3_rs()` | K3-RS | Bypass (PPO-clip) | - | k3 | PPO-clip + K3-RS |
+| **Bypass Mode (REINFORCE)** (2 policies; explicit IS weights, no PPO clipping) |
+| `bypass_pg_is()` | Seq-TIS | Bypass (REINFORCE) | sequence | - | REINFORCE with explicit IS |
+| `bypass_pg_geo_rs()` | Geo-RS | Bypass (REINFORCE) | - | sequence | REINFORCE with Geo-RS (ratio) |
+| `bypass_pg_geo_rs_token_tis()` | Geo-RS-Token-TIS | Bypass (REINFORCE) | token | sequence | REINFORCE + Geo filter + token IS |
+| **Other** |
+| `disabled()` | - | - | - | - | Metrics only, no correction |
+
+**Note:**
+
+- **Bypass mode** sets π_old = π_rollout and uses `loss_type` to select the loss function:
+ - `"ppo_clip"` (default): PPO clipped objective where ratio = π_θ/π_rollout already handles IS
+ - `"reinforce"`: REINFORCE with explicit IS weights as π_θ / π_rollout
+- Both loss types benefit from rejection sampling (RS) which masks out-of-distribution samples.
+- Estimators (Token-TIS, Seq-TIS, Seq-MIS, Geo-RS) are compatible with Decoupled and Bypass modes.
+
+#### Other Supported Combinations (Manual Configuration Required)
+
+**Other supported combinations without preset methods:**
+
+- Token IS + Token RS: Token-level IS weights + token-level RS mask
+- Pure token RS: Token-level RS only, no IS weights
+- Pure sequence RS: Sequence-level RS only, no IS weights
+
+See [detailed configuration examples below](#additional-useful-configurations-not-exposed-as-presets) for manual configurations.
+
+**Key properties:**
+
+- Any aggregation level (token/sequence/geometric) works in either decoupled or bypass mode
+- All combinations are fully supported by the implementation
+- Rejection sampling is independent of IS weighting
+- Pure RS (`bypass_pg_geo_rs`) uses bypass + geometric RS with `loss_type="reinforce"` (no IS weights)
+
+---
+
+### 1. Decoupled Mode with Token-level Importance Sampling (`decoupled_token_is`)
+
+**Configuration:**
+
+```python
+config = RolloutCorrectionConfig.decoupled_token_is(threshold=2.0)
+```
+
+**Components:**
+
+- **Operating Mode**: Decoupled (3 policies)
+- **Loss**: PPO with clipping (only for the second drift correction)
+- **IS Aggregation**: Token-level
+- **RS**: None (can be added separately)
+
+**Equivalent YAML:**
+
+```yaml
+algorithm:
+ rollout_correction:
+ rollout_is: token
+ rollout_is_threshold: 2.0
+ rollout_rs: null
+ bypass_mode: false # Decoupled mode
+```
+
+**Properties:**
+
+- Independent truncation per token
+- Lower variance than sequence-level (product of ratios bounded individually)
+- Typical threshold: 1.5 - 5.0
+
+**Theory:** See [rollout_corr_math.md §3.3.1](rollout_corr_math.md#331-token-level-aggregation)
+
+---
+
+### 2. Decoupled Mode with Sequence-level Importance Sampling (`decoupled_seq_is`)
+
+**Also known as: Seq-TIS (Sequence-Level Truncated IS)**
+
+**Configuration:**
+
+```python
+config = RolloutCorrectionConfig.decoupled_seq_is(threshold=2.0)
+```
+
+**Components:**
+
+- **Operating Mode**: Decoupled (3 policies)
+- **Loss**: PPO with clipping (only for the second drift correction)
+- **IS Aggregation**: Sequence-level (Seq-TIS)
+- **RS**: None (can be added separately)
+
+**Equivalent YAML:**
+
+```yaml
+algorithm:
+ rollout_correction:
+ rollout_is: sequence
+ rollout_is_threshold: 2.0
+ rollout_rs: null
+ bypass_mode: false # Decoupled mode
+```
+
+**Properties:**
+
+- Multiplicative aggregation across sequence
+- More sensitive to outliers than token-level
+- Typical threshold: 2.0 - 10.0 (higher than token-level)
+
+**Theory:** See [rollout_corr_math.md §3.3.2](rollout_corr_math.md#332-sequence-level-aggregation)
+
+---
+
+### 3. Decoupled Mode with Sequence-level IS + Rejection Sampling (`decoupled_seq_is_rs`)
+
+**Also known as: Seq-MIS (Sequence-Level Masked IS)**
+
+**Configuration:**
+
+```python
+config = RolloutCorrectionConfig.decoupled_seq_is_rs(is_threshold=2.0, rs_threshold="0.5_2.0")
+```
+
+**Components:**
+
+- **Operating Mode**: Decoupled (3 policies)
+- **Loss**: PPO with clipping (only for the second drift correction)
+- **IS Aggregation**: Sequence-level (Seq-TIS)
+- **RS**: Sequence-level rejection (Seq-MIS)
+
+**Equivalent YAML:**
+
+```yaml
+algorithm:
+ rollout_correction:
+ rollout_is: sequence
+ rollout_is_threshold: 2.0
+ rollout_rs: seq_sum_k1
+ rollout_rs_threshold: 0.5_2.0
+ bypass_mode: false # Decoupled mode
+```
+
+**Properties:**
+
+- Double mechanism: IS reweighting (Seq-TIS) + rejection filtering (Seq-MIS)
+- Lower effective sample size (rejects outliers)
+- For severe off-policy gaps or when the distribution tail is "toxic" (garbage/adversarial samples)
+
+**When to use Seq-MIS over Seq-TIS:**
+
+- **Seq-TIS (clipping only)**: Maximizes information efficiency; extracts signal from all samples. Use when data is clean and mismatch is moderate.
+- **Seq-MIS (rejection)**: Maximizes safety; acts as a hard trust region filter. Use when mismatch is severe or when high-weight samples are likely garbage rather than signal.
+
+**Theory:** See [rollout_corr_math.md §3.4](rollout_corr_math.md#34-rejection-sampling-rs)
+
+---
+
+### 4. Bypass Mode with PPO-clip (`bypass_ppo_clip`)
+
+**Configuration:**
+
+```python
+config = RolloutCorrectionConfig.bypass_ppo_clip()
+```
+
+**Components:**
+
+- **Operating Mode**: Bypass (2 policies: π_rollout = π_old, π_θ)
+- **Loss**: PPO-clip (IS handled by ratio, no explicit IS weights)
+- **IS Aggregation**: None (PPO ratio handles it)
+- **RS**: None
+
+**Equivalent YAML:**
+
+```yaml
+algorithm:
+ rollout_correction:
+ rollout_is: null
+ rollout_rs: null
+ bypass_mode: true
+ loss_type: ppo_clip
+```
+
+**Properties:**
+
+- PPO clipped objective in bypass mode
+- The PPO ratio = π_θ/π_rollout already handles IS (no explicit IS weights needed)
+- Skips `actor.compute_log_prob()` forward pass (2 policies instead of 3)
+- No rejection sampling - use `bypass_ppo_clip_geo_rs()` for RS
+
+**Configuration requirement:**
+
+- Set `actor_rollout_ref.rollout.calculate_log_probs: true`
+
+**Theory:** See [rollout_corr_math.md §3.1.2](rollout_corr_math.md#312-bypass-mode-two-policies)
+
+---
+
+### 5. REINFORCE with IS (`bypass_pg_is`)
+
+**Configuration:**
+
+```python
+config = RolloutCorrectionConfig.bypass_pg_is(threshold=2.0)
+```
+
+**Components:**
+
+- **Operating Mode**: Bypass (2 policies: π_rollout, π_θ)
+- **Loss**: REINFORCE (policy gradient with explicit IS weights, no PPO clipping)
+- **IS Aggregation**: Sequence-level
+- **RS**: None
+
+**Equivalent YAML:**
+
+```yaml
+algorithm:
+ rollout_correction:
+ rollout_is: sequence
+ rollout_is_threshold: 2.0
+ rollout_rs: null
+ bypass_mode: true
+ loss_type: reinforce # REINFORCE with explicit IS weights
+```
+
+**Properties:**
+
+- REINFORCE loss with explicit IS weights (no PPO clipping)
+- Single forward pass (skips old_log_prob computation)
+- IS weights computed on-the-fly in loss function
+
+**Theory:** See [rollout_corr_math.md §3.2.2](rollout_corr_math.md#322-policy-gradient-loss-with-isrs-correction)
+
+---
+
+## Additional Useful Configurations (Not Exposed as Presets)
+
+These configurations are **fully supported** but don't have convenience preset methods yet.
+
+### 1. Token IS + Token RS (`token_is_rs`)
+
+Token-level IS weights with token-level RS mask.
+
+**Python:**
+
+```python
+config = RolloutCorrectionConfig(
+ rollout_is="token",
+ rollout_is_threshold=2.0,
+ rollout_rs="token_k1",
+ rollout_rs_threshold=2.0,
+)
+```
+
+**Properties:** Per-token IS weights + per-token RS mask.
+
+### 2. Pure Token RS (`token_rs`)
+
+Token-level RS only, no IS weights.
+
+**Python:**
+
+```python
+config = RolloutCorrectionConfig(
+ rollout_is=None,
+ rollout_rs="token_k1",
+ rollout_rs_threshold=2.0,
+)
+```
+
+**Properties:** Token-level RS mask, no IS reweighting.
+
+### 3. Pure Sequence RS (`seq_rs`)
+
+Sequence-level RS only, no IS weights.
+
+**Python:**
+
+```python
+config = RolloutCorrectionConfig(
+ rollout_is=None,
+ rollout_rs="seq_sum_k1",
+ rollout_rs_threshold="0.5_2.0",
+)
+```
+
+**Properties:** Sequence-level RS mask, no IS reweighting.
+
+---
+
+### Summary: How IS Weights are Processed
+
+IS weights (`rollout_is_weights`) go through a fixed processing pipeline (a combined sketch follows the stages):
+
+**Stage 1: Safety Bound (Prevent Overflow)**
+
+- Token level: `exp(clamp(log_ratio, -20, 20))` per token → bounds each token to [2e-9, 5e8]
+- Sequence level: `exp(clamp(sum(log_ratio), -20, 20))` → bounds product to [2e-9, 5e8], broadcast to all tokens
+
+**Stage 2: Truncation (Reduce Variance)**
+
+- `.clamp(max=rollout_is_threshold)` → caps weights at upper threshold (TIS: Truncated Importance Sampling)
+- No lower truncation (preserves unbiasedness for small weights)
+
+**Stage 3: Padding Zeroing (Correct Aggregation)**
+
+- `weights * response_mask` → zeros out padding positions
+
+**Stage 4: Optional Batch Normalization**
+
+- If `rollout_is_batch_normalize=True`: Normalize weights to mean=1.0 within batch
+- Applied after truncation to preserve truncation semantics
+
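+Putting the four stages together, a minimal token-level sketch (illustrative; the production code is `compute_rollout_correction_and_rejection_mask()` in `rollout_corr_helper.py`):
+
+```python
+import torch
+
+def process_is_weights(log_ratio, response_mask, threshold=2.0, batch_normalize=False):
+    w = torch.exp(log_ratio.clamp(-20, 20))  # Stage 1: safety bound
+    w = w.clamp(max=threshold)               # Stage 2: truncation (TIS), upper only
+    w = w * response_mask                    # Stage 3: zero out padding
+    if batch_normalize:                      # Stage 4: optional mean-1.0 normalization
+        w = w / (w.sum() / response_mask.sum())
+    return w
+```
+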
+**Rejection Sampling (Separate Mechanism)**
+
+Rejection sampling modifies `response_mask` (NOT weights) through `compute_rollout_rejection_mask()`:
+
+- Computes safety-bounded ratios independently
+- Creates binary mask: tokens/sequences outside [lower_threshold, upper_threshold] → 0 (rejected)
+- Modified mask used for loss aggregation (rejected samples excluded from training)
+
+## Operation Modes
+
+The framework provides **two operating modes** for computing π_old, which can be combined with different loss functions.
+
+### Operating Modes and Configuration
+
+| Configuration | `bypass_mode` | `loss_type` | Operating Mode | Loss Function | Description |
+| ---------------------- | ------------- | ---------------------- | -------------- | ------------- | ----------------------------------------------------------------- |
+| **Decoupled** | `false` | N/A | Decoupled | PPO | Computes `old_log_prob` separately via `actor.compute_log_prob()` |
+| **Bypass + PPO-clip** | `true` | `"ppo_clip"` (default) | Bypass | PPO-clip | PPO clipped objective (IS handled by ratio) |
+| **Bypass + REINFORCE** | `true` | `"reinforce"` | Bypass | REINFORCE | Policy gradient with explicit IS weights (no PPO clipping) |
+
+### Operating Mode Details
+
+#### Decoupled Mode (Three Policies)
+
+**Policy setup:**
+
+- π_rollout: Behavior policy (data collection)
+- π_old: Proximal policy (computed via `actor.compute_log_prob()` at start of training epoch)
+- π_θ: Current policy (being updated)
+
+**Configuration:** `bypass_mode = false`
+
+**Properties:**
+
+- ✅ Achieves batch size invariance
+- ✅ Separately corrects Drift 1 (rollout→old) and Drift 2 (old→current)
+- ✅ Efficient stale data utilization
+- ❌ Extra forward pass needed (`actor.compute_log_prob()`)
+
+**Theory:** See [rollout_corr_math.md §3.1.1](rollout_corr_math.md#311-decoupled-mode-three-policies)
+
+#### Bypass Mode (Two Policies)
+
+**Policy setup:**
+
+- π_rollout: Behavior policy (data collection)
+- π_old = π_rollout: Proximal policy equals behavior policy
+- π_θ: Current policy (being updated)
+
+**Configuration:** `bypass_mode = true`
+
+**Properties:**
+
+- ✅ Skips `actor.compute_log_prob()` call (faster)
+- ✅ Handles off-policy correction via IS/RS (when using policy gradient with IS/RS)
+- ✅ Uses two policies instead of three (π_rollout = π_old)
+- ⚠️ Does not separate proximal policy from behavior policy (unlike decoupled mode)
+
+**Theory:** See [rollout_corr_math.md §3.1.2](rollout_corr_math.md#312-bypass-mode-two-policies)
+
+---
+
+### IS/RS Aggregation Levels (Orthogonal to Operating Mode)
+
+The aggregation level can be chosen **independently** of the operating mode. Any aggregation level works in either decoupled or bypass mode.
+
+| `rollout_is` | `rollout_rs` | Behavior |
+| ------------------------- | ------------------------------------------------------------------ | --------------------------------------------------------------------------------- |
+| `null` | `null` | **Disabled**: No computation, no metrics, no rejection |
+| `null` | `"token_k1"`, `"seq_sum_k1"`, `"seq_mean_k1"`, `"seq_max_k2"`, etc | **Rejection only**: Compute metrics, NO weight correction, YES rejection sampling |
+| `"token"` or `"sequence"` | `null` | **IS weights only**: Weight correction enabled, NO rejection sampling |
+| `"token"` or `"sequence"` | `"token_k1"`, `"seq_sum_k1"`, `"seq_mean_k1"`, `"seq_max_k2"`, etc | **Full correction**: Both weight correction and rejection sampling enabled |
+
+### Key Insights
+
+- ✅ Any IS/RS aggregation level (token/sequence/geometric) can be used in **either** decoupled or bypass mode
+- ✅ You can use **rejection sampling alone** without IS weight correction (`rollout_is=null, rollout_rs="token_k1"`)
+- ✅ You can use **IS weights alone** without outlier rejection (`rollout_is="token", rollout_rs=null`)
+- ✅ You can use **both together** (`rollout_is="token", rollout_rs="token_k1"`)
+- ✅ You can **monitor metrics only** without any correction by setting both to `null` but still providing rollout_log_probs
+
+**Theory:** See [rollout_corr_math.md §3.3](rollout_corr_math.md#33-isrs-aggregation-levels) for details on aggregation levels.
+
+### Example Workflow
+
+**Recommended: Bypass Mode**
+
+This workflow uses bypass mode for efficiency.
+
+1. **Start with metrics only** to understand the off-policy gap:
+
+ ```yaml
+ algorithm:
+ rollout_correction:
+ rollout_is: null
+ rollout_rs: null
+ bypass_mode: true # Bypass mode (recommended)
+ loss_type: ppo_clip # Default: PPO clipped objective
+ ```
+
+ Monitor `rollout_corr/kl`, `rollout_corr/log_ppl_abs_diff`, `rollout_corr/chi2_token` to assess off-policy gap.
+
+2. **Enable rejection sampling** if you see high outlier fractions:
+
+ ```yaml
+ algorithm:
+ rollout_correction:
+ rollout_is: null
+ rollout_rs: sequence # or "geometric" for higher sensitivity
+ rollout_rs_threshold: 2.0
+ bypass_mode: true # Bypass mode
+ loss_type: ppo_clip # or "reinforce" for explicit IS weights
+ ```
+
+ This excludes outliers from training without modifying gradients.
+
+3. **Enable full IS correction** (with REINFORCE loss) once comfortable with metrics:
+ ```yaml
+ algorithm:
+ rollout_correction:
+ rollout_is: sequence # Recommended: unbiased, suitable for most cases
+ rollout_is_threshold: 2.0
+ rollout_rs: sequence # or "geometric" for more aggressive filtering
+ rollout_rs_threshold: 2.0
+ bypass_mode: true # Bypass mode
+ loss_type: reinforce # REINFORCE with explicit IS weights
+ ```
+
+**Benefits of bypass mode:**
+
+- ✅ Skips expensive `actor.compute_log_prob()` forward pass (faster)
+- ✅ `loss_type` controls the loss function: "ppo_clip" (default) or "reinforce"
+- ✅ PPO-clip: IS handled by ratio (no explicit weights), RS mask applied
+- ✅ REINFORCE: Explicit IS weights computed on-the-fly (π_θ / π_rollout)
+- ✅ Both loss types work with all IS/RS combinations
+
+## Usage
+
+### Basic Setup
+
+```yaml
+algorithm:
+ rollout_correction:
+ rollout_is: token # Enable IS weights at token level
+ rollout_is_threshold: 2.0 # Threshold for IS weights
+ rollout_rs: null # No rejection sampling
+
+actor_rollout_ref:
+ rollout:
+ calculate_log_probs: true # Required!
+```
+
+### Metrics
+
+All metrics are prefixed with `rollout_corr/` in logs. For example, `rollout_is_mean` appears as `rollout_corr/rollout_is_mean`.
+
+These metrics cover both:
+
+- **Diagnostic metrics**: KL divergence, perplexity differences (measuring off-policy gap)
+- **Correction statistics**: IS weights, rejection rates (measuring correction applied)
+
+#### **Core IS Weight Metrics**
+
+- **`rollout_is_mean`**: Mean importance sampling weight across all valid tokens
+
+ - Value close to 1.0 indicates minimal off-policy gap
+
+- **`rollout_is_std`**: Standard deviation of IS weights
+
+ - Higher values indicate greater variance in IS weights
+
+- **`rollout_is_min`**: Minimum IS weight observed
+
+ - Shows the most underweighted token/sequence
+ - For sequence/geometric: computed from unclamped log-space ratios (true minimum)
+ - For token: computed from safety-bounded weights
+
+- **`rollout_is_max`**: Maximum IS weight observed
+ - Shows the most overweighted token/sequence
+ - For sequence/geometric: computed from unclamped log-space ratios (true maximum before safety bound)
+ - For token: computed from safety-bounded weights (before threshold clamping)
+ - Compare with `rollout_is_threshold` to see truncation impact
+
+#### **Effective Sample Size**
+
+- **`rollout_is_eff_sample_size`**: Effective sample size after IS weighting
+ - **Formula**: `1 / mean(weights²)` where weights are normalized
+ - **Range**: 0.0 to 1.0 (as fraction of original batch)
+  - Lower values indicate weight concentration on fewer samples; see the sketch below
+
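+A minimal sketch of this formula, given a 1-D tensor of IS weights at valid positions (illustrative):
+
+```python
+import torch
+
+def effective_sample_size(weights):
+    """ESS as a fraction of the batch, after normalizing weights to mean 1.0."""
+    w = weights / weights.mean()
+    return 1.0 / (w ** 2).mean()
+```
+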
+#### **Threshold Exceedance Metrics**
+
+- **`rollout_is_ratio_fraction_high`**: Fraction of weights exceeding upper threshold
+
+ - Shows how often truncation/masking occurs on high end
+ - For sequence/geometric: computed from unclamped log-space ratios (true exceedance)
+ - For token: computed from safety-bounded weights (before threshold clamping)
+
+- **`rollout_is_ratio_fraction_low`**: Fraction of weights below lower threshold (1/upper_threshold)
+ - Diagnostic metric showing how many weights are below the reciprocal threshold
+ - For sequence/geometric: computed from unclamped log-space ratios (true exceedance)
+ - For token: computed from safety-bounded weights (before truncation)
+
+#### **Sequence-Level Metrics** (for sequence aggregation)
+
+- **`rollout_is_seq_mean`**: Mean IS weight at sequence level
+
+ - Should match `rollout_is_mean` for sequence-level aggregation
+
+- **`rollout_is_seq_std`**: Standard deviation of sequence-level IS weights
+
+- **`rollout_is_seq_min`**: Minimum sequence-level IS weight
+
+- **`rollout_is_seq_max`**: Maximum sequence-level IS weight
+
+- **`rollout_is_seq_max_deviation`**: Maximum absolute deviation from 1.0 at sequence level
+
+ - Shows worst-case sequence off-policy gap
+
+- **`rollout_is_seq_fraction_high`**: Fraction of sequences exceeding upper threshold
+
+- **`rollout_is_seq_fraction_low`**: Fraction of sequences below lower threshold
+
+#### **Rejection Sampling Metrics** (when `rollout_rs` is enabled)
+
+- **`rollout_rs_masked_fraction`**: Fraction of tokens rejected via rejection sampling
+
+ - **Important**: Rejection sampling modifies `response_mask` (sets rejected tokens to 0)
+ - **Separate from IS weights**: IS weights are still truncated; rejection is an independent filtering step
+ - Only present when `rollout_rs` is enabled (token/sequence/geometric)
+
+- **`rollout_rs_seq_masked_fraction`**: Fraction of sequences with at least one rejected token
+ - Shows sequence-level impact of rejection sampling
+ - Token-level RS: sequence rejected if ANY token is outside [lower, upper]
+ - Sequence-level RS: entire sequence rejected or accepted based on sequence-level ratio
+ - Geometric RS: entire sequence rejected or accepted based on geometric mean
+
+#### **Off-Policy Diagnostic Metrics** (Training vs Rollout Policy)
+
+**Note on terminology:** These metrics use "training" to refer to the training reference policy and "rollout" to refer to π_rollout (the behavior policy used for data collection).
+
+- **Decoupled mode**: "training" = π_old (computed at start of training epoch)
+- **Bypass/Pure IS mode**: "training" = π_θ (current policy being trained)
+
+In bypass/pure IS mode, metrics measure the drift between π_θ and π_rollout directly.
+
+- **`training_ppl`**: Perplexity of training reference policy (π_old in decoupled mode, π_θ in bypass/pure IS mode)
+
+ - **Formula**: `exp(-mean(log_probs))`
+ - Lower values indicate higher model confidence
+
+- **`rollout_ppl`**: Perplexity of rollout policy π_rollout (e.g., vLLM BF16)
+
+- **`ppl_ratio`**: Ratio of training PPL to rollout PPL
+
+ - **Formula**: `exp(mean(log(training_ppl / rollout_ppl)))`
+ - **Meaning**: > 1.0 means training is less confident than rollout
+
+- **`training_log_ppl`**: Log perplexity of training policy
+
+ - Useful for identifying trends (linear scale)
+
+- **`rollout_log_ppl`**: Log perplexity of rollout policy
+
+- **`log_ppl_diff`**: Mean difference in log perplexities
+
+ - **Formula**: `mean(log_ppl_rollout - log_ppl_training)`
+ - Sign indicates which policy is more confident
+
+- **`log_ppl_abs_diff`**: Mean absolute log perplexity difference
+
+ - Magnitude of off-policy gap regardless of direction
+
+- **`log_ppl_diff_max`**: Maximum log perplexity difference across sequences
+
+ - Identifies worst-case sequence
+
+- **`log_ppl_diff_min`**: Minimum log perplexity difference across sequences
+
+- **`kl`**: KL divergence KL(π_rollout || π_training)
+
+ - **Formula**: `mean(log_prob_rollout - log_prob_training)`
+ - **Note**: Can be negative (rollout is less confident)
+
+- **`k3_kl`**: K3 divergence (equals KL(π_rollout || π_training) in expectation)
+
+ - **Formula**: `mean(exp(log_ratio) - log_ratio - 1)`
+ - More stable than direct KL (non-negative per token)
+ - Always >= 0
+
+- **`chi2_token`**: Chi-squared divergence at token level
+
+ - **Formula**: `mean(ratio²) - 1` where ratio = π_training/π_rollout
+ - Measures second moment of IS weight distribution
+ - Always non-negative
+
+- **`chi2_seq`**: Chi-squared divergence at sequence level
+ - **Formula**: `mean((∏_t ratio_t)²) - 1`
+ - Sequence-level second moment of IS weights
+  - More sensitive than token-level chi-squared; a short computation sketch follows
+
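+A minimal sketch of the three divergence estimators above, computed from flattened log-prob tensors at valid positions (illustrative; tensor names are assumptions):
+
+```python
+import torch
+
+def offpolicy_divergences(training_log_prob, rollout_log_prob):
+    log_ratio = training_log_prob - rollout_log_prob    # log(pi_training / pi_rollout)
+    kl = (-log_ratio).mean()                            # mean(log p_rollout - log p_training)
+    k3 = (torch.exp(log_ratio) - log_ratio - 1).mean()  # non-negative per token
+    chi2 = torch.exp(2 * log_ratio).mean() - 1          # mean(ratio^2) - 1
+    return kl, k3, chi2
+```
+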
+#### **Example: Accessing Metrics in Code**
+
+```python
+# Metrics are returned from compute_rollout_correction_and_rejection_mask
+from verl.trainer.ppo.rollout_corr_helper import compute_rollout_correction_and_rejection_mask
+
+# Returns 3 values (weights, modified_response_mask, metrics)
+weights_proto, modified_response_mask, metrics = compute_rollout_correction_and_rejection_mask(
+ old_log_prob=training_log_probs, # from training policy
+ rollout_log_prob=rollout_log_probs, # from rollout policy
+ response_mask=response_mask,
+ rollout_is="token", # Enable IS weights at token level
+ rollout_is_threshold=2.0,
+ rollout_rs="token_k1",
+ rollout_rs_threshold="0.5_2.0",
+)
+
+# Extract IS weights (processed, zeroed at padding)
+is_weights = weights_proto.batch["rollout_is_weights"]
+
+# IS weights processing (with IS enabled at token level):
+# 1. Safety-bounded: exp(clamp(log_ratio, -20, 20)) per token
+# 2. Truncated: .clamp(max=2.0) to cap extreme weights
+# 3. Zeroed at padding positions
+# Note: Truncation is ALWAYS applied to IS weights (TIS: Truncated Importance Sampling)
+
+# modified_response_mask has rejection applied (since rollout_rs="token_k1"):
+# 1. RS rejection: tokens outside [0.5, 2.0] masked to 0 via response_mask
+# Note: RS and IS are separate mechanisms - both can be enabled independently
+
+# All metrics have 'rollout_corr/' prefix
+print(f"Mean IS weight: {metrics['rollout_corr/rollout_is_mean']:.3f}")
+print(f"Effective sample size: {metrics['rollout_corr/rollout_is_eff_sample_size']:.3f}")
+print(f"RS masked fraction: {metrics['rollout_corr/rollout_rs_masked_fraction']:.3f}")
+print(f"KL divergence: {metrics['rollout_corr/kl']:.3f}")
+
+# Check IS weights for valid tokens (non-padding)
+valid_weights = is_weights[response_mask.bool()]
+print(f"\n✓ IS weights min (valid tokens): {valid_weights.min():.4f}")
+print(f"✓ IS weights max (valid tokens): {valid_weights.max():.4f}")
+print(f"✓ All valid IS weights > 0: {(valid_weights > 0).all()}")
+print(f"✓ IS weights are capped at threshold: {(valid_weights <= 2.0).all()}")
+
+# Check rejection via response_mask
+rejected_tokens = (response_mask == 1) & (modified_response_mask == 0)
+print(f"\n✓ Rejected {rejected_tokens.sum()} tokens via response_mask")
+print(f"✓ Rejection sampling modifies response_mask (separate from IS weight truncation)")
+print(f"✓ IS weights are always truncated to [0, threshold] after safety bounding")
+
+# Check for warning conditions
+if metrics['rollout_corr/rollout_is_mean'] < 0.5 or metrics['rollout_corr/rollout_is_mean'] > 2.0:
+ print("⚠️ Warning: Mean IS weight far from 1.0, significant off-policy gap detected")
+
+if metrics['rollout_corr/rollout_is_eff_sample_size'] < 0.3:
+ print("⚠️ Warning: Low effective sample size, high weight concentration")
+```
+
+#### **Example: Monitoring Metrics During Training**
+
+```python
+# In your training loop
+for epoch in range(num_epochs):
+ for batch_idx, batch in enumerate(dataloader):
+ # ... rollout phase ...
+
+ # Compute IS weights and get metrics
+ rollout_corr_config = config.algorithm.get("rollout_correction", None)
+ if rollout_corr_config is not None:
+ weights_proto, modified_response_mask, metrics = compute_rollout_correction_and_rejection_mask(
+ old_log_prob=batch.old_log_prob,
+ rollout_log_prob=batch.rollout_log_prob,
+ response_mask=batch.response_mask,
+ rollout_is=rollout_corr_config.get("rollout_is", None),
+ rollout_is_threshold=rollout_corr_config.get("rollout_is_threshold", 2.0),
+ rollout_rs=rollout_corr_config.get("rollout_rs", None),
+ rollout_rs_threshold=rollout_corr_config.get("rollout_rs_threshold", None),
+ )
+
+ # Log to tensorboard/wandb
+ for metric_name, metric_value in metrics.items():
+ logger.log_scalar(metric_name, metric_value, step=global_step)
+
+ # IMPORTANT: Update batch response_mask with rejection applied
+ batch.response_mask = modified_response_mask
+
+ # Use IS weights in training (always safety-bounded, zeroed at padding)
+ is_weights = weights_proto.batch["rollout_is_weights"]
+ # ... apply weights to policy gradient ...
+```
+
+#### **Example: Conditional Alerting Based on Metrics**
+
+```python
+def check_rollout_correction_health(metrics, config):
+ """Check if Rollout Correction metrics indicate healthy training."""
+ warnings = []
+
+ # Check mean IS weight
+ mean_weight = metrics['rollout_corr/rollout_is_mean']
+ if mean_weight < 0.5 or mean_weight > 2.0:
+ warnings.append(f"Mean IS weight {mean_weight:.3f} is far from 1.0")
+
+ # Check effective sample size
+ ess = metrics['rollout_corr/rollout_is_eff_sample_size']
+ if ess < 0.3:
+ warnings.append(f"Effective sample size {ess:.3f} is too low")
+
+ # Check standard deviation
+ std = metrics['rollout_corr/rollout_is_std']
+ if std > 1.0:
+ warnings.append(f"IS weight std {std:.3f} is too high")
+
+ # Check KL divergence
+ kl = metrics['rollout_corr/kl']
+ if abs(kl) > 0.1:
+ warnings.append(f"KL divergence {kl:.3f} indicates significant off-policy gap")
+
+ # Check chi-squared divergence
+ if 'rollout_corr/chi2_token' in metrics:
+ chi2_token = metrics['rollout_corr/chi2_token']
+ if chi2_token > 1.0:
+ warnings.append(f"Chi-squared divergence (token) {chi2_token:.3f} indicates severe distribution shift")
+
+ if warnings:
+ print("⚠️ Rollout Correction Health Warnings:")
+ for warning in warnings:
+ print(f" - {warning}")
+ return False
+ else:
+ print("✅ Rollout Correction metrics look healthy")
+ return True
+
+# Use in training
+_, _, metrics = compute_rollout_correction_and_rejection_mask(...)
+is_healthy = check_rollout_correction_health(metrics, config)
+
+if not is_healthy:
+ # Consider adjusting config or investigating issues
+ print("Consider:")
+ print(" - Tightening rollout_is_threshold")
+ print(" - Switching to geometric aggregation level")
+ print(" - Checking if rollout and training policies are too different")
+```
+
+### Running Examples
+
+Start with the basic token-level truncation configuration:
+
+```bash
+bash examples/rollout_correction/run_with_rollout_corr.sh
+```
+
+Monitor metrics for 1-2 epochs before adjusting parameters.
+
+## Configuration Examples
+
+### Example 1: IS Weights Only (Token Level)
+
+```yaml
+algorithm:
+ rollout_correction:
+ rollout_is: token
+ rollout_is_threshold: 2.0
+ rollout_rs: null # No rejection sampling
+```
+
+### Example 2: Rejection Sampling Only (No IS Weights)
+
+```yaml
+algorithm:
+ rollout_correction:
+ rollout_is: null # No IS weights
+ rollout_rs: token_k1
+ rollout_rs_threshold: "0.5_2.0"
+```
+
+### Example 3: Both IS and RS (Token RS)
+
+```yaml
+algorithm:
+ rollout_correction:
+ rollout_is: token
+ rollout_is_threshold: 2.0
+ rollout_rs: token_k1
+ rollout_rs_threshold: "0.5_2.0"
+```
+
+### Example 4: Bypass Mode with PPO-clip (Default)
+
+```yaml
+algorithm:
+ rollout_correction:
+ rollout_is: token
+ rollout_is_threshold: 2.0
+ rollout_rs: token_k1
+ rollout_rs_threshold: "0.5_2.0"
+ bypass_mode: true # Skip old_log_prob computation
+ loss_type: ppo_clip # PPO clipped objective (default)
+```
+
+**Skips expensive `actor.compute_log_prob()` forward pass. PPO ratio = π_θ/π_rollout handles IS.**
+
+### Example 5: Bypass Mode with REINFORCE
+
+```yaml
+algorithm:
+ rollout_correction:
+ rollout_is: sequence # Explicit IS correction in loss
+ rollout_is_threshold: 2.0
+ rollout_rs: null # Optional: can add rejection sampling
+ bypass_mode: true
+ loss_type: reinforce # REINFORCE with explicit IS weights
+```
+
+**No PPO clipping; pure policy gradient with IS correction.**
+
+### Example 6: Bypass Mode with PPO-clip + Rejection Sampling
+
+```yaml
+algorithm:
+ rollout_correction:
+ rollout_is: sequence # Computed for metrics
+ rollout_is_threshold: 2.0
+ rollout_rs: seq_max_k2 # Sequence max χ²/2 guard
+ rollout_rs_threshold: 2.5
+ bypass_mode: true
+ loss_type: ppo_clip # PPO clipped objective (IS handled by ratio)
+```
+
+**PPO clipping with rejection sampling. IS handled by PPO ratio (no explicit IS weights).**
+
+## Troubleshooting
+
+### Issue: High spread in IS weights
+
+**Symptoms:** `rollout_is_std` > 1.0, `rollout_is_eff_sample_size` < 0.3
+
+**Solutions:**
+
+1. Switch from `sequence` to `geometric` level
+2. Tighten thresholds
+3. Verify rollout and training aren't too different
+
+### Issue: Mean IS weight far from 1.0
+
+**Symptoms:** `rollout_is_mean` < 0.5 or > 2.0
+
+**Solutions:**
+
+1. Verify `calculate_log_probs=True` is set
+2. Check rollout_log_probs are correctly passed
+3. Check for systematic distribution shift
+
+### Debugging: Visualizing Metrics
+
+**Example: Plot IS weight distribution**
+
+```python
+import matplotlib.pyplot as plt
+import numpy as np
+
+def plot_is_metrics(metrics_history):
+ """Plot rollout IS metrics over training steps."""
+ fig, axes = plt.subplots(2, 3, figsize=(15, 10))
+
+ # Plot 1: Mean IS weight over time
+ axes[0, 0].plot(metrics_history['rollout_corr/rollout_is_mean'])
+ axes[0, 0].axhline(y=1.0, color='r', linestyle='--', label='Ideal')
+ axes[0, 0].set_title('Mean IS Weight')
+ axes[0, 0].set_xlabel('Step')
+ axes[0, 0].legend()
+
+ # Plot 2: Effective sample size
+ axes[0, 1].plot(metrics_history['rollout_corr/rollout_is_eff_sample_size'])
+ axes[0, 1].axhline(y=0.5, color='g', linestyle='--', label='Good')
+ axes[0, 1].axhline(y=0.3, color='r', linestyle='--', label='Warning')
+ axes[0, 1].set_title('Effective Sample Size')
+ axes[0, 1].set_xlabel('Step')
+ axes[0, 1].legend()
+
+ # Plot 3: KL divergence over time
+ axes[1, 0].plot(metrics_history['rollout_corr/kl'], label='KL')
+ axes[1, 0].plot(metrics_history['rollout_corr/k3_kl'], label='K3 KL')
+ axes[1, 0].axhline(y=0, color='g', linestyle='--', alpha=0.3)
+ axes[1, 0].set_title('KL Divergence')
+ axes[1, 0].set_xlabel('Step')
+ axes[1, 0].legend()
+
+ # Plot 4: PPL ratio over time
+ axes[1, 1].plot(metrics_history['rollout_corr/ppl_ratio'])
+ axes[1, 1].axhline(y=1.0, color='r', linestyle='--', label='Ideal')
+ axes[1, 1].set_title('PPL Ratio (Training/Rollout)')
+ axes[1, 1].set_xlabel('Step')
+ axes[1, 1].legend()
+
+ # Plot 5: Chi-squared divergence
+ if 'rollout_corr/chi2_token' in metrics_history:
+ axes[1, 2].plot(metrics_history['rollout_corr/chi2_token'], label='Token-level')
+ if 'rollout_corr/chi2_seq' in metrics_history:
+ axes[1, 2].plot(metrics_history['rollout_corr/chi2_seq'], label='Seq-level')
+ axes[1, 2].axhline(y=1.0, color='r', linestyle='--', label='Warning')
+ axes[1, 2].set_title('Chi-squared Divergence')
+ axes[1, 2].set_xlabel('Step')
+ axes[1, 2].legend()
+ else:
+ axes[1, 2].axis('off')
+
+ plt.tight_layout()
+ plt.savefig('rollout_is_metrics.png', dpi=150)
+ print("Saved plot to rollout_is_metrics.png")
+```
+
+**Example: Metric collection during training**
+
+```python
+# Collect metrics over time
+metrics_history = {
+ 'rollout_corr/rollout_is_mean': [],
+ 'rollout_corr/rollout_is_eff_sample_size': [],
+ 'rollout_corr/kl': [],
+ 'rollout_corr/k3_kl': [],
+ 'rollout_corr/ppl_ratio': [],
+ 'rollout_corr/chi2_token': [],
+ 'rollout_corr/chi2_seq': [],
+}
+
+# In training loop
+for step in range(num_steps):
+ # ... compute IS weights and rejection mask ...
+ _, _, metrics = compute_rollout_correction_and_rejection_mask(...)
+
+ # Store metrics
+ for key in metrics_history.keys():
+ if key in metrics:
+ metrics_history[key].append(metrics[key])
+
+ # Plot every 100 steps
+ if step % 100 == 0:
+ plot_is_metrics(metrics_history)
+```
+
+## Performance Impact
+
+- **Memory overhead**: ~1% of model memory
+- **Computational overhead**: 1-3% depending on level
+- **Training stability**: Significantly improved when off-policy gap exists
+
+## Testing
+
+Run the test suite to verify everything works:
+
+```bash
+# Basic unit tests
+python tests/trainer/ppo/test_rollout_corr.py
+
+# Integration tests (if pytest is available)
+pytest tests/trainer/ppo/test_rollout_corr_integration.py -v
+```
+
+Expected output: All tests pass ✓
+
+## Additional Resources
+
+- **Implementation**: `verl/trainer/ppo/rollout_corr_helper.py`
+- **Examples**: `examples/rollout_correction/`
+- **DAPO Example**: `recipe/dapo/run_dapo_qwen2.5_32b_rollout_corr.sh`
+
+## Summary
+
+Rollout Correction provides a unified framework for handling general off-policy problems in RL:
+
+- ✅ Corrects ANY distribution shift between data collection and training
+- ✅ Supports diverse scenarios: policy mismatch, staleness, replay buffers, off-policy algorithms
+- ✅ Numerical stability with safety bounds and rejection mechanisms
+- ✅ Comprehensive diagnostics: KL, perplexity, χ² divergence
+- ✅ Flexible methods from token-level to sequence-level aggregation
+- ✅ Memory-efficient implementation
+
+## References
+
+- **[Mathematical Formulations](rollout_corr_math.md)** - Detailed mathematical theory and derivations for all rollout correction methods
+- [When Speed Kills Stability: Demystifying RL Collapse from the Training-Inference Mismatch](https://richardli.xyz/rl-collapse) (see Blog Series above for parts 1-3)
+- [Your Efficient RL Framework Secretly Brings You Off-Policy RL Training](https://fengyao.notion.site/off-policy-rl)
diff --git a/code/RL_model/verl/verl_train/docs/algo/rollout_corr_math.md b/code/RL_model/verl/verl_train/docs/algo/rollout_corr_math.md
new file mode 100644
index 0000000000000000000000000000000000000000..b0b0c13a29c072c179f89e23d2539cc06a8a52b1
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/algo/rollout_corr_math.md
@@ -0,0 +1,954 @@
+# Mathematical Formulations of Rollout Correction Methods in `verl`
+
+**Author:** [Yingru Li](https://richardli.xyz)
+**Last updated:** 2025-11-04
+
+---
+
+> **📖 Documentation Structure**
+> - **This document** - Mathematical theory: formulations, derivations, and algorithmic foundations
+> - **[Rollout Correction Usage Guide](rollout_corr.md)** - Practical implementation: configurations, presets, troubleshooting
+>
+> Start here for theory and design rationale, refer to the usage guide for implementation.
+
+---
+
+### BibTeX Citation
+
+```bibtex
+@online{liu-li-2025-rl-collapse,
+ title = {When Speed Kills Stability: Demystifying {RL} Collapse from the Training-Inference Mismatch},
+ author = {Liu, Jiacai and Li, Yingru and Fu, Yuqian and Wang, Jiawei and Liu, Qian and Shen, Yu},
+ year = {2025},
+ month = sep,
+ url = {https://richardli.xyz/rl-collapse}
+}
+```
+
+### Blog Series
+
+- Main blog post: https://richardli.xyz/rl-collapse
+- [Part 1: Why Mismatch Breaks LLM-RL](https://richardli.xyz/rl-collapse-1) (analytical framework using TV distance for bias and χ²-divergence for variance)
+- [Part 2: The Gradient Estimator Trials](https://richardli.xyz/rl-collapse-2) (token-level vs sequence-level correction bias-variance tradeoff)
+- [Part 3: When Math Meets Reality—Toxic Tails and Length Traps](https://richardli.xyz/rl-collapse-3) (why rejection over clipping, and geometric-level RS)
+
+## Abstract
+
+This document provides the definitive mathematical formulations for rollout correction methods in `verl`, following the natural progression from **REINFORCE** to **PPO** to **Decoupled PPO**.
+
+Rollout correction provides a unified framework to handle **general off-policy problems** in RL training - any scenario where the data collection distribution differs from the training distribution.
+
+**Applicable scenarios include:**
+- **Policy mismatch**: Different precision (FP8 vs FP16 vs BF16 vs FP32), different backends (vLLM vs SGLang vs FSDP vs Megatron)
+- **Temporal lag**: Model staleness, asynchronous rollout workers
+- **Replay buffers**: Training on historical trajectories from earlier policy versions
+- **Off-policy algorithms**: Behavioral cloning, DAPO, expert demonstrations
+- **Data filtering**: Reweighting, preference learning, curriculum learning
+
+---
+
+## Table of Contents
+
+1. [Theoretical Foundation: From REINFORCE to Decoupled PPO](#1-theoretical-foundation-from-reinforce-to-decoupled-ppo)
+2. [Implementation in verl: The Three-Policy Framework](#2-implementation-in-verl-the-three-policy-framework)
+3. [Algorithmic Components and Combinations](#3-algorithmic-components-and-combinations)
+4. [Off-Policy Diagnostic Metrics](#4-off-policy-diagnostic-metrics)
+5. [Summary and Decision Guide](#5-summary-and-decision-guide)
+6. [Implementation References](#6-implementation-references)
+
+---
+
+## 1. Theoretical Foundation: From REINFORCE to Decoupled PPO
+
+This section establishes the theoretical progression that `verl` implements.
+
+### 1.1 REINFORCE: Policy Gradient Baseline
+
+The REINFORCE algorithm ([Williams, 1992](https://doi.org/10.1007/BF00992696)) is the foundation of policy gradient methods.
+
+**Vanilla REINFORCE (On-Policy)**
+
+For trajectories $\tau = (s_0, a_0, s_1, a_1, \ldots, s_T, a_T)$ sampled from the current policy $\pi_\theta$, the policy gradient is:
+
+$$
+\nabla_\theta J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta} \left[ \sum_{t=0}^T \nabla_\theta \log \pi_\theta(a_t|s_t) \cdot A_t \right]
+$$
+
+where $A_t$ is the advantage function at timestep $t$.
+
+**Off-Policy REINFORCE**
+
+When trajectories are sampled from a different behavior policy $\mu$, we apply importance sampling over the **joint trajectory distribution**:
+
+$$
+\nabla_\theta J(\theta) = \mathbb{E}_{\tau \sim \mu} \left[ \frac{P_{\pi_\theta}(\tau)}{P_\mu(\tau)} \sum_{t=0}^T \nabla_\theta \log \pi_\theta(a_t|s_t) \cdot A_t \right]
+$$
+
+where the trajectory-level importance weight is:
+
+$$
+\frac{P_{\pi_\theta}(\tau)}{P_\mu(\tau)} = \frac{p(s_0) \prod_{t=0}^T \pi_\theta(a_t|s_t) p(s_{t+1}|s_t, a_t)}{p(s_0) \prod_{t=0}^T \mu(a_t|s_t) p(s_{t+1}|s_t, a_t)} = \prod_{t=0}^T \frac{\pi_\theta(a_t|s_t)}{\mu(a_t|s_t)}
+$$
+
+The transition dynamics $p(s_{t+1}|s_t, a_t)$ and initial state $p(s_0)$ cancel out, leaving only the product of per-step action probability ratios.
+
+**Key properties:**
+- **Off-policy capable**: Can learn from any behavior policy via importance sampling
+- **No trust region**: Policy updates not constrained
+
+**Implementation in verl:** The `bypass_pg_is` preset implements off-policy REINFORCE with truncated importance sampling.
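+
+To make this concrete, here is a minimal PyTorch sketch of off-policy REINFORCE with a truncated sequence-level IS weight (illustrative names and shapes, not verl's actual code):
+
+```python
+import torch
+
+def reinforce_seq_tis_loss(logp_theta, logp_rollout, advantages, mask, c_is=2.0):
+    """Off-policy REINFORCE sketch: truncated trajectory-level IS weight with stopgrad."""
+    # Trajectory ratio = product of per-token ratios = exp(sum of log-ratios)
+    log_rho = ((logp_theta - logp_rollout) * mask).sum(-1)
+    w_seq = torch.exp(log_rho).clamp(max=c_is).detach()   # truncate, then stop gradient
+    pg_term = (logp_theta * advantages * mask).sum(-1)    # sum_t log pi_theta * A_t
+    return -(w_seq * pg_term).mean()
+
+# Toy usage: batch of 2 sequences, 4 tokens each
+logp_theta = torch.randn(2, 4, requires_grad=True)
+logp_rollout = logp_theta.detach() - 0.05 * torch.rand(2, 4)
+loss = reinforce_seq_tis_loss(logp_theta, logp_rollout, torch.randn(2, 4), torch.ones(2, 4))
+loss.backward()
+```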
+
+### 1.2 PPO: Adding Trust Region Control
+
+Proximal Policy Optimization ([Schulman et al., 2017](https://arxiv.org/abs/1707.06347)) adds a clipped surrogate objective:
+
+$$
+L_{\text{PPO}}(\theta) = -\mathbb{E}_{(s,a) \sim \mu} \left[ \min\left( r_t(\theta) A_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) A_t \right) \right]
+$$
+
+where $r_t(\theta) = \frac{\pi_\theta(a_t|s_t)}{\mu(a_t|s_t)}$ and $\epsilon$ is the clip range (typically 0.2).
+
+**Key properties:**
+- **Two policies**: $\mu$ (reference for clipping) and $\pi_\theta$ (being updated)
+- **Trust region via clipping**: Limits policy update magnitude via ratio $r_t(\theta) = \frac{\pi_\theta}{\mu}$
+
+### 1.3 Decoupled PPO: Achieving Batch Size Invariance
+
+Decoupled PPO ([Hilton et al., 2021](https://arxiv.org/abs/2110.00641)) solves PPO's batch size sensitivity by **decoupling two roles**:
+1. **Proximal policy** $\pi_{\text{prox}}$: The anchor policy for PPO clipping (controls policy update size)
+2. **Behavior policy** $\mu$: The policy that collected the data (for off-policy correction via importance sampling)
+
+**The problem**: Standard PPO controls policy update size via the ratio $\frac{\pi_\theta}{\pi_{\text{old}}}$, where $\pi_{\text{old}}$ is assumed to be both the proximal policy *and* the behavior policy. This coupling makes the algorithm sensitive to batch size because aggregating data from multiple workers or using replay buffers changes the effective behavior policy.
+
+**The solution**: Decouple these two roles, leading to a **three-policy formulation**:
+
+$$
+L_{\text{DecoupledPPO}}(\theta) = -\mathbb{E}_{(s,a) \sim \mu} \left[ w_t \cdot \min\left( r_t(\theta) A_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) A_t \right) \right]
+$$
+
+where:
+- $w_t = \frac{\pi_{\text{prox}}(a_t|s_t)}{\mu(a_t|s_t)}$: Importance sampling weight (corrects for behavior policy $\mu$). Here $\pi_{\text{prox}}$ is frozen during training, so $w_t$ is constant (no stopgrad operator needed).
+- $r_t(\theta) = \frac{\pi_\theta(a_t|s_t)}{\pi_{\text{prox}}(a_t|s_t)}$: PPO ratio (controls policy update size against proximal policy $\pi_{\text{prox}}$)
+
+**Key properties**: By decoupling:
+- **Batch size invariance**: Policy update control (via $\pi_{\text{prox}}$) is independent of data aggregation
+- **Flexible behavior policy**: Any $\mu$ can be used (different workers, replay buffers, or stale checkpoints)
+- **Stale data utilization**: Older trajectories can be corrected via importance sampling
+- **Clipping preserved**: Clipping against $\pi_{\text{prox}}$ limits update magnitude
+
+**This is the algorithm that `verl` implements via its three-policy framework.**
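+
+A minimal PyTorch sketch of this loss, assuming per-token log-probs from the three policies (illustrative, not verl's implementation):
+
+```python
+import torch
+
+def decoupled_ppo_loss(logp_theta, logp_prox, logp_rollout, advantages, mask,
+                       eps=0.2, c_is=2.0):
+    """Decoupled PPO sketch: w = pi_prox/mu is a frozen constant, r = pi_theta/pi_prox."""
+    w = torch.exp(logp_prox - logp_rollout).clamp(max=c_is)  # both inputs frozen -> constant
+    r = torch.exp(logp_theta - logp_prox)                    # PPO ratio
+    surrogate = torch.min(r * advantages,
+                          r.clamp(1 - eps, 1 + eps) * advantages)
+    return -(w * surrogate * mask).sum() / mask.sum()
+```
+
+Setting `logp_prox = logp_rollout` makes $w_t = 1$ and recovers standard PPO-clip against the rollout policy, which is exactly what bypass mode does.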
+
+---
+
+## 2. Implementation in verl: The Three-Policy Framework
+
+The `verl` library implements decoupled PPO using three distinct policies, each serving a specific role.
+
+### 2.1 Policy Roles and Notation
+
+**$\pi_{\text{rollout}}$ (Behavior Policy $\mu$)**
+The policy used for data collection. This is the behavior distribution $\mu$ from theory.
+
+- **When created**: During rollout/data collection phase
+- **Purpose**: Generate trajectories for training
+- **Common sources**:
+ - Policy mismatch: Same weights, different implementation (precision, backend)
+ - Temporal lag: Stale checkpoint from async workers
+ - Replay buffer: Historical data from earlier iterations
+ - Off-policy algorithms: Expert demonstrations, auxiliary policies (DAPO)
+ - Data filtering: Reweighted or filtered data
+- **Fixed**: Frozen during training on a batch
+
+**$\pi_{\text{old}}$ (Proximal Policy $\pi_{\text{prox}}$)**
+The reference policy for PPO clipping. This is the "proximal policy" from decoupled PPO theory.
+
+- **When created**:
+ - **Decoupled mode**: Computed at start of training epoch via `actor.compute_log_prob()`
+ - **Bypass mode**: Set equal to $\pi_{\text{rollout}}$ (skips separate computation)
+- **Purpose**:
+ - Anchor point for PPO clipping (controls policy update size)
+ - When separate from $\pi_{\text{rollout}}$: Enables batch size invariance and efficient use of stale data
+- **Fixed**: Frozen during all PPO update epochs on the same batch
+
+**$\pi_{\theta}$ (Current Policy)**
+The policy being actively optimized during training.
+
+- **Updated**: Every gradient step
+- **Purpose**: The policy we're improving
+
+### 2.2 Operating Modes
+
+The three-policy framework can operate in two modes:
+
+**Decoupled Mode (Three Policies)**
+- Computes $\pi_{\text{old}}$ separately at the start of each training epoch
+- **Algorithm**: Full decoupled PPO with three policies (mathematically correct)
+- **Properties**: Achieves batch size invariance; separately corrects Drift 1 (rollout→old) and Drift 2 (old→current)
+
+**Bypass Mode (Two Policies)**
+- Sets $\pi_{\text{old}} = \pi_{\text{rollout}}$ (skips separate computation)
+- **Algorithm**: Uses $\pi_{\text{rollout}}$ as both behavior policy and proximal policy (mathematically correct)
+- **Key difference**: Proximal policy equals behavior policy, so no IS correction is needed between them
+- **Properties**: Faster (skips `actor.compute_log_prob()` call); does not achieve batch size invariance
+
+### 2.3 Two Distribution Shifts
+
+The three-policy framework handles two types of distribution drift:
+
+**Drift 1: $\pi_{\text{rollout}} \to \pi_{\text{old}}$ (Off-Policy Gap)**
+
+This is the distribution shift between the data collection policy and the training reference policy.
+
+- **Nature**: Ranges from negligible (same checkpoint, minor differences) to severe (replay buffers, expert data)
+- **Correction**: Importance sampling weight $w_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$
+- **Optional**: Can be ignored (bypass mode) when negligible
+
+**Drift 2: $\pi_{\text{old}} \to \pi_{\theta}$ (Policy Update Drift)**
+
+This is the drift from policy parameter updates during training.
+
+- **Nature**: Occurs as $\pi_\theta$ is updated via gradient descent
+- **Correction**: PPO clipping on ratio $r_t(\theta) = \frac{\pi_\theta(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$
+- **Universal**: Applies to both on-policy and off-policy training
+
+### 2.4 Notation Summary
+
+- $\pi_{\text{rollout}}$: Behavior policy (data collection)
+- $\pi_{\text{old}}$: Proximal policy (PPO anchor)
+- $\pi_{\theta}$: Current policy (being updated)
+- $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$: Per-token IS ratio (corrects Drift 1)
+- $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$: PPO ratio (corrects Drift 2)
+- $A_t$: Advantage at token $t$
+- $T$: Set of valid tokens in a sequence
+- $C_{\text{IS}}$: Upper threshold for IS weights (e.g., 2.0)
+- $C_{\text{RS-upper}}$: Upper threshold for RS mask (e.g., 2.0)
+- $C_{\text{RS-lower}}$: Lower threshold for RS mask (typically $1/C_{\text{RS-upper}}$)
+- $\epsilon$: PPO clip range (typically 0.2)
+
+---
+
+## 3. Algorithmic Components and Combinations
+
+The rollout correction framework in `verl` is built from **orthogonal components** that can be combined flexibly:
+
+1. **Operating Mode**: How $\pi_{\text{old}}$ is computed (Decoupled vs Bypass)
+2. **Loss Function**: PPO (with clipping) vs Pure IS (policy gradient only)
+3. **IS/RS Aggregation Level**: Token, Sequence, or Geometric
+
+This section explains each component and their valid combinations.
+
+### 3.1 Operating Modes: Decoupled vs Bypass
+
+The operating mode determines how the proximal policy $\pi_{\text{old}}$ is computed.
+
+#### 3.1.1 Decoupled Mode (Three Policies)
+
+**Configuration:** `bypass_mode = false`
+
+**Policy setup:**
+- $\pi_{\text{rollout}}$: Behavior policy (data collection)
+- $\pi_{\text{old}}$: Proximal policy (computed via `actor.compute_log_prob()` at start of training epoch)
+- $\pi_{\theta}$: Current policy (being updated)
+
+**IS ratio:** $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ (corrects Drift 1: rollout→old)
+
+**PPO ratio:** $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$ (corrects Drift 2: old→current)
+
+**Properties:**
+- ✅ Achieves batch size invariance
+- ✅ Separately corrects two distribution drifts
+- ✅ Efficient stale data utilization
+- ❌ Extra forward pass needed (`actor.compute_log_prob()`)
+
+#### 3.1.2 Bypass Mode (Two Policies)
+
+**Configuration:** `bypass_mode = true`
+
+**Policy setup:**
+- $\pi_{\text{rollout}}$: Behavior policy (data collection)
+- $\pi_{\text{old}} = \pi_{\text{rollout}}$: Proximal policy equals behavior policy
+- $\pi_{\theta}$: Current policy (being updated)
+
+**Ratios:**
+- **With PPO-clip loss** (`loss_type = "ppo_clip"`, default): PPO ratio $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ clips against rollout policy (IS handled by ratio)
+- **With REINFORCE loss** (`loss_type = "reinforce"`): IS ratio $\rho_t = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ computed on-the-fly in loss function
+
+**Properties:**
+- ✅ Skips `actor.compute_log_prob()` call (faster)
+- ✅ Handles off-policy correction via IS/RS (when using policy gradient with IS/RS)
+- ✅ Uses two policies instead of three (π_rollout = π_old)
+- ⚠️ Does not separate proximal policy from behavior policy (unlike decoupled mode)
+
+---
+
+### 3.2 Loss Functions: PPO vs Policy Gradient
+
+#### 3.2.1 PPO Loss (with Clipping)
+
+**Configuration:** `loss_type = "ppo_clip"` (default in bypass mode)
+
+**Loss function:**
+
+$$
+L_{\text{PPO}}(\theta) = -\mathbb{E}_t \left[ w_t \cdot \min\left( r_t(\theta) A_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) A_t \right) \right]
+$$
+
+where:
+- $w_t$: IS weight (depends on aggregation level, see Section 3.3). In decoupled mode, $w_t = \frac{\pi_{\text{old}}}{\pi_{\text{rollout}}}$ where $\pi_{\text{old}}$ is frozen, so $w_t$ is constant (no stopgrad needed). In bypass mode with PPO loss, no separate IS weights are typically computed.
+- $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$: PPO ratio
+- $\epsilon$: Clip range (typically 0.2)
+
+**Properties:**
+- Trust region control via clipping
+- Limits policy update magnitude
+- Standard in RL training
+
+#### 3.2.2 Policy Gradient Loss (with IS/RS Correction)
+
+**Configuration:** `loss_type = "reinforce"` (requires `bypass_mode = true`)
+
+**Loss function** (example with sequence-level IS):
+
+$$
+L_{\text{PG}}(\theta) = -\mathbb{E}_{(s,a) \sim \pi_{\text{rollout}}} \left[ \text{stopgrad}(w_{\text{seq}}(\theta)) \cdot \sum_{t \in T} \log \pi_{\theta}(a_t|s_t) \cdot A_t \right]
+$$
+
+where:
+- $w_{\text{seq}}(\theta)$: Sample weight (IS or RS; see §3.3 and §3.5 for details)
+- For IS: $w_{\text{seq}}(\theta) = \min\left( \prod_{t \in T} \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}, C_{\text{IS}} \right)$
+- For RS: $w_{\text{seq}}(\theta) \in \{0, 1\}$ (binary rejection mask)
+- **stopgrad operator**: The weight $w_{\text{seq}}(\theta)$ is computed using $\pi_\theta$ but treated as a **constant coefficient** when computing $\nabla_\theta L$. This is essential for importance sampling correctness (see theoretical justification below).
+
+**Effective gradient:**
+
+$$
+\nabla_\theta L_{\text{PG}} = -\mathbb{E}_{(s,a) \sim \pi_{\text{rollout}}} \left[ \text{stopgrad}(w_{\text{seq}}(\theta)) \cdot \sum_{t \in T} \nabla_\theta \log \pi_{\theta}(a_t|s_t) \cdot A_t \right]
+$$
+
+**Theoretical Justification for stopgrad:**
+
+The stopgrad operator is **mathematically required** by importance sampling theory, not an implementation detail. Here's why:
+
+**The fundamental principle**: Importance sampling is a technique to **change the measure** (reweight samples from one distribution to estimate expectations under another), not to optimize the reweighting function itself.
+
+**Formal derivation**:
+
+1. **Original objective**: We want to optimize $J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta}[\sum_t A_t]$.
+
+2. **Off-policy setting**: We only have samples from $\pi_{\text{rollout}}$, so we use importance sampling:
+ $$
+ J(\theta) = \mathbb{E}_{\tau \sim \pi_{\text{rollout}}} \left[ \underbrace{\frac{P_{\pi_\theta}(\tau)}{P_{\pi_{\text{rollout}}}(\tau)}}_{w(\tau;\theta)} \sum_t A_t \right]
+ $$
+
+3. **Computing the policy gradient**: The correct gradient uses the **policy gradient theorem BEFORE importance sampling**:
+ $$
+ \begin{aligned}
+ \nabla_\theta J(\theta) &= \nabla_\theta \mathbb{E}_{\tau \sim \pi_\theta}\left[\sum_t A_t\right] \\
+ &= \mathbb{E}_{\tau \sim \pi_\theta} \left[\sum_t A_t \nabla_\theta \log \pi_\theta(a_t|s_t) \right] \quad \text{(policy gradient theorem)} \\
+ &= \mathbb{E}_{\tau \sim \pi_{\text{rollout}}} \left[ w(\tau;\theta) \sum_t A_t \nabla_\theta \log \pi_\theta(a_t|s_t) \right] \quad \text{(change of measure)}
+ \end{aligned}
+ $$
+
+ In the final line, $w(\tau;\theta)$ appears as a **multiplicative coefficient** from the change of measure, not as something we differentiate.
+
+4. **What goes wrong without stopgrad**: If we naively compute $\nabla_\theta \left[w(\theta) \log \pi_\theta \right]$ in the loss, we get:
+ $$
+ \nabla_\theta \left[w(\theta) \log \pi_\theta \right] = \underbrace{\log \pi_\theta \cdot \nabla_\theta w(\theta)}_{\text{WRONG: bias term}} + \underbrace{w(\theta) \cdot \nabla_\theta \log \pi_\theta}_{\text{CORRECT: IS-weighted gradient}}
+ $$
+
+ The first term $\log \pi_\theta \cdot \nabla_\theta w(\theta)$ is an artifact of the computational trick (using loss times log-prob), not part of the true policy gradient. It biases the gradient estimator and optimizes a different objective than $J(\theta)$.
+
+5. **Implementation requirement**: In PyTorch, to compute only the second term, we must use:
+ ```python
+ loss = -advantages * log_prob * rollout_is_weights.detach() # stopgrad on weights
+ ```
+ Without `.detach()`, autograd computes both terms, giving an incorrect gradient.
+
+**Intuition**: The IS weight $w(\theta)$ tells us "how much to trust this sample" for estimating the gradient under $\pi_\theta$. We update $\theta$ to maximize the reweighted objective, but we don't update $\theta$ to maximize the weight itself—that would be circular reasoning (optimizing the correction factor instead of the actual objective).
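+
+A tiny PyTorch check with made-up scalar values illustrates the bias:
+
+```python
+import torch
+
+logp = torch.tensor(-1.0, requires_grad=True)   # log pi_theta(a|s), hypothetical value
+logp_rollout = torch.tensor(-1.2)               # log pi_rollout(a|s)
+adv = 2.0
+
+w = torch.exp(logp - logp_rollout)              # IS weight, depends on theta
+loss_correct = -(w.detach() * logp * adv)       # stopgrad: gradient is -w * adv
+loss_biased = -(w * logp * adv)                 # no stopgrad: extra term from grad of w
+
+g_correct, = torch.autograd.grad(loss_correct, logp, retain_graph=True)
+g_biased, = torch.autograd.grad(loss_biased, logp)
+# g_biased - g_correct == -adv * logp * w: the bias term from differentiating the weight
+print(g_correct.item(), g_biased.item())
+```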
+
+**Properties:**
+- **Algorithm**: Off-policy policy gradient with IS/RS correction
+- **Loss types** (`loss_type` config option in bypass mode):
+ - `"ppo_clip"` (default): PPO clipped objective
+ - $L = -\mathbb{E}[\min(r \cdot A, \text{clip}(r) \cdot A)]$ where $r = \pi_\theta / \pi_{\text{rollout}}$
+ - Note: IS weights NOT applied (PPO ratio already handles it; would be double-counting)
+ - `"reinforce"`: Pure policy gradient with explicit IS weights, no PPO clipping
+ - $L = -\mathbb{E}[w \cdot \log \pi_\theta(a|s) \cdot A]$ where $w = \pi_\theta / \pi_{\text{rollout}}$
+- **Always uses bypass mode**: Direct $\pi_\theta$ to $\pi_{\text{rollout}}$ comparison
+- **Fast**: Single forward pass
+
+**Implementation:** `compute_policy_loss_bypass_mode()` and `compute_policy_loss_reinforce()` in [core_algos.py](../../verl/trainer/ppo/core_algos.py)
+
+---
+
+### 3.3 IS/RS Aggregation Levels
+
+The aggregation level determines how per-token probability ratios are combined into IS weights and/or rejection masks. This choice is **orthogonal to the operating mode** - you can use any aggregation level in either decoupled or bypass mode.
+
+#### 3.3.1 Token-Level Aggregation
+
+**IS weights:** $w_t = \min(\rho_t, C_{\text{IS}})$ where $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ (decoupled) or $\rho_t = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ (bypass/pure IS)
+
+**Configuration:**
+```python
+rollout_is = "token" # IS weights
+rollout_rs = "token_k1" # Optional: rejection sampling (ratio bounds)
+```
+
+**Properties:**
+- Independent truncation per token
+- Lower variance than sequence-level (product of ratios bounded individually)
+- **Bias-variance tradeoff**: Token-level correction has $O(T^2 \Delta_{\max})$ bias where $T$ is sequence length and $\Delta_{\max}$ is maximum per-token policy divergence. This bias becomes significant when the rollout policy deviates substantially from the training policy. Sequence-level correction is unbiased but has higher variance.
+- Typical threshold: 1.5 - 5.0
+- Optional batch normalization (§3.4): Normalizes over all token weights to ensure $\mathbb{E}[\tilde{w}_t] = 1$ (reduces variance)
+- **When to use**: Token-level works well when rollout policy stays within the trust region of training policy. When mismatch is significant, the bias becomes intolerable and sequence-level correction is preferred.
+
+**Loss function (REINFORCE + Token IS):**
+
+$$
+L_{\text{REINFORCE+TIS}}(\theta) = -\mathbb{E}_t \left[ \text{stopgrad}(w_t) \cdot \log \pi_\theta(a_t|s_t) \cdot A_t \right]
+$$
+
+where $w_t = \min(\rho_t, C_{\text{IS}})$ are the truncated token-level IS weights. The stopgrad operator ensures that when computing $\nabla_\theta L$, the weights are treated as constants (see §3.2.2 for theoretical justification). This formulation can also be combined with PPO clipping by replacing the REINFORCE gradient with the clipped surrogate objective.
+
+**Implementation:**
+- IS weights: `compute_rollout_correction_weights()` in [rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py#L325-L402)
+- Loss: `compute_policy_loss()` in [core_algos.py](../../verl/trainer/ppo/core_algos.py#L812-L884)
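+
+As a minimal sketch (assuming per-token log-prob tensors; the helper name is hypothetical), computing the weights is an elementwise clamp:
+
+```python
+import torch
+
+def token_tis_weights(logp_num, logp_den, c_is=2.0):
+    """Token-level truncated IS weights: clamp each per-token ratio independently.
+    logp_num is pi_old (decoupled mode) or pi_theta (bypass/pure-IS mode);
+    logp_den is pi_rollout."""
+    return torch.exp(logp_num - logp_den).clamp(max=c_is)
+```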
+
+#### 3.3.2 Sequence-Level Aggregation
+
+**IS weights:** $w_{\text{seq}} = \min\left( \prod_{t \in T} \rho_t, C_{\text{IS}} \right) = \min\left( \exp\left(\sum_{t \in T} \log \rho_t\right), C_{\text{IS}} \right)$ (broadcast to all tokens)
+
+**Configuration:**
+```python
+rollout_is = "sequence" # IS weights
+rollout_rs = "seq_sum_k1" # Optional: rejection sampling
+```
+
+**Properties:**
+- Multiplicative aggregation across sequence
+- More sensitive to outliers than token-level
+- Typical threshold: 2.0 - 10.0
+- Optional batch normalization (§3.4): Normalizes over sequence means (one weight per sequence)
+
+**Terminology Note:**
+- **Seq-TIS (Sequence-Level Truncated IS)**: Clips the sequence ratio $\rho(\tau) \to \min(\rho(\tau), C)$. Maximizes information efficiency by extracting signal from all samples. Best for clean data with moderate mismatch.
+- **Seq-MIS (Sequence-Level Masked IS)**: Rejects (masks) sequences with $\rho(\tau) > C$ instead of clipping. Acts as a hard trust region filter. Best for severe mismatch or when the distribution tail is "toxic" (contains garbage/adversarial samples rather than signal).
+
+**Loss function (REINFORCE + Sequence IS):**
+
+$$
+L_{\text{REINFORCE+SeqIS}}(\theta) = -\mathbb{E}_t \left[ \text{stopgrad}(w_{\text{seq}}) \cdot \log \pi_\theta(a_t|s_t) \cdot A_t \right]
+$$
+
+where $w_{\text{seq}}$ is broadcast to all tokens in the sequence. The stopgrad operator ensures correct IS gradient computation (see §3.2.2). This formulation can also be combined with PPO clipping.
+
+#### 3.3.3 Geometric Mean Aggregation (Geo-RS)
+
+**Geometric mean ratio:** $\rho_{\text{geo}} = \exp\left( \frac{1}{|T|} \sum_{t \in T} \log \rho_t \right) = \left(\prod_{t \in T} \rho_t\right)^{1/|T|}$ (broadcast to all tokens)
+
+**Configuration:**
+```python
+rollout_is = None # No IS weights, pure rejection
+rollout_rs = "seq_mean_k1" # Geometric mean rejection sampling (ratio bounds)
+```
+
+**Properties:**
+- Length-invariant (normalizes by sequence length)
+- Ideal ratio = 1.0 (policies match)
+- Typical bounds: `"0.999_1.001"` (~±0.1%)
+- **Used for rejection sampling only, not IS weighting**
+
+**The Length Trap Problem:**
+
+Standard IS estimators have a systematic **length bias** that penalizes long sequences. The importance ratio $\rho(y)$ is multiplicative:
+
+$$
+\rho(y) = \prod_{t=1}^T \frac{\pi(y_t \mid y_{<t})}{\mu(y_t \mid y_{<t})}
+$$
+
+so its log is a sum over $T$ tokens: even a small per-token mismatch compounds with length, and long sequences are disproportionately truncated or rejected. Geometric mean aggregation avoids this trap by normalizing the log-ratio by sequence length, making the acceptance test length-invariant.
+
+#### 3.3.4 K3 Divergence Aggregation (K3-RS)
+
+**K3 divergence:** $K3_t = \rho_t - \log \rho_t - 1$ per token, aggregated as the per-sequence mean $K3_{\text{seq}} = \frac{1}{|T|} \sum_{t \in T} K3_t$ and used for rejection sampling (`rollout_rs = "seq_mean_k3"`).
+
+**Properties:**
+- K3 divergence >= 0 per token (equals 0 when ρ = 1)
+- More stable than geometric ratio checks because each token term is non-negative
+- Only upper threshold applies (no lower threshold since K3 >= 0)
+- Typical threshold: 0.001 - 0.01
+
+**Why K3 over geometric ratio?**
+- Geometric ratio uses average log-ratio; small numerical bias can flip sign
+- K3 = E[ρ - log ρ - 1] is non-negative per token, offering a smoother detector
+- Both estimate the same quantity: KL(π_rollout || π_old)
+- For small divergences, K3 ≈ 0.5 × Var(log_ratio)
+
+**Combined Estimator (K3-RS-Token-TIS):**
+
+For best results, combine K3 filter with token-level IS weights:
+
+$$
+\hat{g}_{\text{k3-rs-token-tis}}(y) = \underbrace{\mathbb{I}\left( K3_{\text{seq}} \le C_{\text{k3}} \right)}_{\text{K3 Filter}} \cdot \prod_t \min(\rho_t, C) \cdot f(y)
+$$
+
+This is implemented by combining `rollout_rs="seq_mean_k3"` with `rollout_is="token"`.
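+
+Both rejection masks can be sketched as follows (hypothetical helpers operating on per-token log-probs and a `response_mask` of valid tokens):
+
+```python
+import torch
+
+def geo_rs_mask(logp_old, logp_rollout, response_mask, lower=0.999, upper=1.001):
+    """Geometric-mean RS sketch: length-invariant, rejects whole sequences."""
+    log_rho = (logp_old - logp_rollout) * response_mask
+    rho_geo = torch.exp(log_rho.sum(-1) / response_mask.sum(-1))  # exp(mean log-ratio)
+    accept = ((rho_geo >= lower) & (rho_geo <= upper)).float()
+    return response_mask * accept.unsqueeze(-1)                   # broadcast to tokens
+
+def k3_rs_mask(logp_old, logp_rollout, response_mask, threshold=0.01):
+    """K3 RS sketch: rejects sequences whose mean K3 divergence exceeds the threshold."""
+    log_rho = (logp_old - logp_rollout) * response_mask
+    k3 = (torch.exp(log_rho) - log_rho - 1.0) * response_mask     # >= 0 per valid token
+    k3_seq = k3.sum(-1) / response_mask.sum(-1)
+    accept = (k3_seq <= threshold).float()                        # upper threshold only
+    return response_mask * accept.unsqueeze(-1)
+```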
+
+
+---
+
+### 3.4 Batch Normalization
+
+An optional variance reduction technique that normalizes IS weights to have mean 1.0 within each batch.
+
+**Configuration:**
+```python
+rollout_is_batch_normalize = True # Default: False
+```
+
+**Normalization formula (aggregation-aware):**
+
+For **token-level IS** (§3.3.1):
+
+$$
+\tilde{w}_t = \frac{w_t}{\frac{1}{\sum_{i,t} m_{i,t}} \sum_{i,t} w_{i,t} \cdot m_{i,t}}
+$$
+
+where $w_{i,t}$ are truncated token IS weights, $m_{i,t}$ is the response mask, and normalization is over **all tokens**.
+
+For **sequence-level IS** (§3.3.2):
+
+$$
+\tilde{w}_i = \frac{w_i}{\frac{1}{B}\sum_{j=1}^B \bar{w}_j}
+$$
+
+where $\bar{w}_j = \frac{1}{T_j}\sum_{t=1}^{T_j} w_{j,t} \cdot m_{j,t}$ is the per-sequence mean (all tokens in a sequence have the same weight), and normalization is over **sequences**.
+
+**Properties:**
+- Applied **after** truncation to preserve truncation semantics
+- Ensures $\mathbb{E}[\tilde{w}] = 1$ within each batch
+- **Aggregation-aware**: Token-level normalizes over tokens; sequence-level normalizes over sequences
+- Uses `masked_mean` to respect padding tokens
+- Reduces gradient magnitude variance by removing random batch-level scale fluctuations
+
+**Metrics:**
+- `rollout_is_batch_norm_factor`: The normalization factor applied (batch mean before normalization)
+
+**Implementation:** [rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py#L401-L421)
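+
+A sketch of the two aggregation-aware normalizations (hypothetical helpers; `weights` are the already-truncated IS weights):
+
+```python
+import torch
+
+def batch_norm_token_is(weights, response_mask):
+    """Token-level: normalize by the masked mean over all tokens in the batch."""
+    factor = (weights * response_mask).sum() / response_mask.sum()
+    return weights / factor, factor  # factor is logged as rollout_is_batch_norm_factor
+
+def batch_norm_seq_is(weights, response_mask):
+    """Sequence-level: normalize by the mean of per-sequence mean weights."""
+    seq_mean = (weights * response_mask).sum(-1) / response_mask.sum(-1)
+    factor = seq_mean.mean()
+    return weights / factor, factor
+```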
+
+---
+
+### 3.5 Rejection Sampling (RS)
+
+Rejection sampling can be added to **any combination** of operating mode and aggregation level. It modifies the `response_mask` to exclude outlier tokens/sequences.
+
+**Configuration examples:**
+```python
+rollout_rs = "token_k1" # Token-level ratio bounds
+rollout_rs_threshold = "0.6_1.6"
+
+rollout_rs = "seq_sum_k1" # Sequence sum of log ratios
+rollout_rs_threshold = "0.5_2.0"
+
+rollout_rs = "seq_mean_k3" # Sequence mean of K3 divergence
+rollout_rs_threshold = 0.01
+```
+
+**Acceptance set:**
+- **Token-level**: $\mathcal{A}_{\text{token}} = \{ t : C_{\text{RS-lower}} \leq \rho_t \leq C_{\text{RS-upper}} \}$
+- **Sequence-level**: $\mathcal{A}_{\text{seq}} = \{ \text{seq} : C_{\text{RS-lower}} \leq \prod_{t \in T} \rho_t \leq C_{\text{RS-upper}} \}$
+- **Geometric**: $\mathcal{A}_{\text{geo}} = \{ \text{seq} : C_{\text{RS-lower}} \leq \rho_{\text{geo}} \leq C_{\text{RS-upper}} \}$
+
+**Properties:**
+- Separate from IS weighting (can use RS without IS)
+- Reduces effective sample size
+- Filters extreme outliers
+
+**Implementation:** `compute_rollout_rejection_mask()` in [rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py#L80-L188)
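+
+For example, the token-level acceptance test can be sketched as a modified response mask (hypothetical helper, not the library function above):
+
+```python
+import torch
+
+def token_rs_mask(logp_old, logp_rollout, response_mask, lower=0.5, upper=2.0):
+    """Token-level RS sketch: zero out tokens whose ratio leaves [lower, upper]."""
+    rho = torch.exp(logp_old - logp_rollout)
+    accept = ((rho >= lower) & (rho <= upper)).to(response_mask.dtype)
+    return response_mask * accept  # rejected tokens drop out of the loss
+```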
+
+---
+
+### 3.6 Combination Matrix
+
+**Key insight:** Estimators (how IS/RS is computed) and operating modes (decoupled PPO vs bypass PG) are **orthogonal**. Any estimator can be combined with any operating mode.
+
+#### Estimator × Operating Mode
+
+| Estimator | Configuration | Compatible Modes |
+|-----------|---------------|------------------|
+| **Token-TIS** | `rollout_is="token"` | Decoupled PPO, Bypass PG |
+| **Seq-TIS** | `rollout_is="sequence"` | Decoupled PPO, Bypass PG |
+| **Seq-MIS** | `rollout_is="sequence"` + `rollout_rs="seq_sum_k1"` | Decoupled PPO, Bypass PG |
+| **Geo-RS** | `rollout_rs="seq_mean_k1"` (geometric mean) | Decoupled PPO, Bypass PG |
+| **Geo-RS-Token-TIS** | `rollout_is="token"` + `rollout_rs="seq_mean_k1"` | Decoupled PPO, Bypass PG |
+| **K3-RS** | `rollout_rs="seq_mean_k3"` | Decoupled PPO, Bypass PG |
+| **K3-RS-Token-TIS** | `rollout_is="token"` + `rollout_rs="seq_mean_k3"` | Decoupled PPO, Bypass PG |
+
+**Note:** In bypass mode, `loss_type` controls the loss function. Use "ppo_clip" (default) or "reinforce".
+
+#### Available Preset Methods
+
+| Preset Method | Estimator | Mode | Properties |
+|---------------|-----------|------|------------|
+| **Decoupled PPO Mode** (3 policies: π_rollout, π_old, π_θ) |
+| `decoupled_token_is()` | Token-TIS | Decoupled PPO | Per-token IS weights |
+| `decoupled_seq_is()` | Seq-TIS | Decoupled PPO | Sequence-level IS weights |
+| `decoupled_seq_is_rs()` | Seq-MIS | Decoupled PPO | Sequence IS + sequence RS |
+| `decoupled_geo_rs()` | Geo-RS | Decoupled PPO | Geometric RS + seq\_max\_k2 guard |
+| `decoupled_geo_rs_token_tis()` | Geo-RS-Token-TIS | Decoupled PPO | Geometric filter + token IS |
+| **K3 KL Estimator** (more stable for small KL values) |
+| `decoupled_k3_rs()` | K3-RS | Decoupled PPO | K3 rejection, no IS weights |
+| `decoupled_k3_rs_token_tis()` | K3-RS-Token-TIS | Decoupled PPO | K3 filter + token clipped weight |
+| **Bypass Mode (PPO-clip)** (ratio handles IS, RS masks outliers) |
+| `bypass_ppo_clip()` | - | Bypass (PPO-clip) | PPO-clip only |
+| `bypass_ppo_clip_geo_rs()` | Geo-RS | Bypass (PPO-clip) | PPO-clip + Geo-RS (ratio) |
+| `bypass_ppo_clip_k3_rs()` | K3-RS | Bypass (PPO-clip) | PPO-clip + K3-RS |
+| **Bypass Mode (REINFORCE)** (explicit IS weights, no PPO clipping) |
+| `bypass_pg_is()` | Seq-TIS | Bypass (REINFORCE) | REINFORCE + Seq IS |
+| `bypass_pg_geo_rs()` | Geo-RS | Bypass (REINFORCE) | REINFORCE + Geo-RS (ratio) |
+| `bypass_pg_geo_rs_token_tis()` | Geo-RS-Token-TIS | Bypass (REINFORCE) | REINFORCE + Geo filter + token IS |
+| **Other** |
+| `disabled()` | - | - | Metrics only |
+
+**Note:** Bypass mode sets π_old = π_rollout and uses `loss_type` to select the loss function.
+
+#### Additional Supported Combinations (Manual Configuration)
+
+These combinations are **fully supported** but require manual configuration:
+
+**1. Token IS + Token RS**
+```python
+config = RolloutCorrectionConfig(
+ rollout_is="token",
+ rollout_is_threshold=2.0,
+ rollout_rs="token_k1",
+ rollout_rs_threshold="0.5_2.0",
+)
+```
+**Properties:** Token-level IS weights + token-level RS mask.
+
+**2. Pure Token RS**
+```python
+config = RolloutCorrectionConfig(
+ rollout_is=None,
+ rollout_rs="token_k1",
+ rollout_rs_threshold="0.5_2.0",
+)
+```
+**Properties:** Token-level RS mask only, no IS weights.
+
+**3. Pure Sequence RS**
+```python
+config = RolloutCorrectionConfig(
+ rollout_is=None,
+ rollout_rs="seq_sum_k1",
+ rollout_rs_threshold="0.5_2.0",
+)
+```
+**Properties:** Sequence-level RS mask only, no IS weights.
+
+**Key properties:**
+- Any IS aggregation level (token/sequence) can be used in either decoupled or bypass mode
+- Rejection sampling can be added to any combination
+- Geometric aggregation is typically used for RS only (not IS weighting)
+- Pure RS (`bypass_pg_geo_rs`) uses bypass mode + geometric RS with `loss_type="reinforce"` (no IS weights)
+- All combinations in the table above are valid and supported by the implementation
+
+---
+
+### 3.7 Common Implementation Mistake
+
+#### Incorrect LLM-RL Implementation (PPO Without Rollout Correction)
+
+**Theory:** Naive LLM-RL implementation that incorrectly applies PPO by **ignoring the actual rollout policy** and assuming $\pi_{\text{old}} = \pi_{\text{rollout}}$.
+
+**Note:** This incorrect implementation pattern was identified in [Liu, Li, et al. (2025)](https://richardli.xyz/rl-collapse) as a key cause of training instability in LLM-RL systems, motivating the development of this rollout correction framework.
+
+**Loss Function:**
+
+$$
+L_{\text{PPO}}(\theta) = -\mathbb{E}_t \left[ \min\left( r_t(\theta) A_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) A_t \right) \right]
+$$
+
+where $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$ (ignores $\pi_{\text{rollout}}$).
+
+**Why it's wrong:**
+- **Ignores $\pi_{\text{rollout}}$**: Uses $\pi_{\text{old}}$ as behavior policy instead of actual $\pi_{\text{rollout}}$
+- **Policy mismatch**: In LLM-RL, rollout typically uses different precision/backend/checkpoint than training, causing $\pi_{\text{rollout}} \neq \pi_{\text{old}}$ even with same model weights
+- **Not PPO's fault**: PPO itself is correct; the issue is the incorrect assumption
+
+**Correct alternatives:**
+1. **Decoupled mode**: Three policies with IS correction from $\pi_{\text{rollout}}$ to $\pi_{\text{old}}$
+2. **Bypass mode**: Two policies using $\pi_{\text{rollout}}$ as both behavior policy and proximal policy
+3. **Bypass + Policy Gradient mode**: Two policies with IS/RS correction and no PPO clipping
+
+**Implementation:** `compute_policy_loss()` in [core_algos.py](../../verl/trainer/ppo/core_algos.py#L812-L884)
+
+---
+
+## 4. Off-Policy Diagnostic Metrics
+
+These metrics quantify the severity of off-policy drift.
+
+**Note on notation:** Metrics use $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$. In bypass mode, $\pi_{\text{old}} = \pi_{\text{rollout}}$, so metrics measure rollout→current drift using $\rho_t = \frac{\pi_{\theta}}{\pi_{\text{rollout}}}$ instead.
+
+### 4.1 KL Divergence
+
+**Direct KL estimator:**
+
+$$
+\text{KL}(\pi_{\text{rollout}} \| \pi_{\text{old}}) = \mathbb{E}_{t \sim \pi_{\text{rollout}}} \left[ \log \pi_{\text{rollout}}(a_t|s_t) - \log \pi_{\text{old}}(a_t|s_t) \right]
+$$
+
+**K3 KL estimator** (alternative formulation):
+
+$$
+\text{KL}_{\text{K3}} = \mathbb{E}_{t \sim \pi_{\text{rollout}}} \left[ \rho_t - \log \rho_t - 1 \right]
+$$
+
+where $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$.
+
+### 4.2 Perplexity
+
+**Old policy perplexity:**
+
+$$
+\text{PPL}_{\text{old}} = \exp\left( -\frac{1}{|T|} \sum_{t \in T} \log \pi_{\text{old}}(a_t|s_t) \right)
+$$
+
+**Rollout policy perplexity:**
+
+$$
+\text{PPL}_{\text{rollout}} = \exp\left( -\frac{1}{|T|} \sum_{t \in T} \log \pi_{\text{rollout}}(a_t|s_t) \right)
+$$
+
+**PPL ratio** (inverse of geometric mean IS weight):
+
+$$
+\text{PPL}_{\text{ratio}} = \frac{\text{PPL}_{\text{old}}}{\text{PPL}_{\text{rollout}}} = \exp\left( -\frac{1}{|T|} \sum_{t \in T} \log \rho_t \right) = \left(\prod_{t \in T} \rho_t\right)^{-1/|T|}
+$$
+
+**Interpretation:** Values > 1 mean $\pi_{\text{old}}$ assigns lower probability than $\pi_{\text{rollout}}$ to the observed actions (distribution shift).
+
+### 4.3 Chi-squared Divergence
+
+Measures the second moment of the IS weight distribution.
+
+**Token-level:**
+
+$$
+\chi^2_{\text{token}} = \mathbb{E}_{t \sim \pi_{\text{rollout}}} \left[ \rho_t^2 \right] - 1
+$$
+
+**Sequence-level:**
+
+$$
+\chi^2_{\text{seq}} = \mathbb{E}_{\text{seq} \sim \pi_{\text{rollout}}} \left[ \left(\prod_{t \in T} \rho_t\right)^2 \right] - 1
+$$
+
+**Interpretation:**
+- $\chi^2 = 0$: Policies are identical
+- $\chi^2 > 0$: Higher values indicate more severe off-policy distribution shift
+
+**Implementation:** `compute_offpolicy_metrics()` in [rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py#L670-L776)
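+
+A compact sketch of these diagnostics (hypothetical helper; metric names are illustrative):
+
+```python
+import torch
+
+def offpolicy_diagnostics(logp_old, logp_rollout, mask):
+    """Sketch of the drift metrics above: direct KL (k1), K3 KL, PPL ratio, chi^2."""
+    log_rho = (logp_old - logp_rollout) * mask
+    rho = torch.exp(log_rho)                                    # equals 1 on padding
+    n = mask.sum()
+    kl = -(log_rho.sum() / n)                                   # E[log pi_rollout - log pi_old]
+    kl_k3 = ((rho - log_rho - 1.0) * mask).sum() / n            # K3 estimator, >= 0
+    ppl_ratio = torch.exp(-log_rho.sum(-1) / mask.sum(-1)).mean()
+    chi2_token = ((rho.pow(2) - 1.0) * mask).sum() / n          # E[rho^2] - 1
+    return {"kl": kl, "kl_k3": kl_k3, "ppl_ratio": ppl_ratio, "chi2_token": chi2_token}
+```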
+
+---
+
+## 5. Summary and Decision Guide
+
+### 5.1 Method Summary Table
+
+| Method | Theory | Policies | PPO Clip | IS Correction | Correctness | Speed |
+|--------|--------|----------|----------|---------------|-------------|-------|
+| **Bypass Mode** (π_old = π_rollout, `loss_type` selects algorithm) |
+| `loss_type="ppo_clip"` (default) | PPO (ratio = π_θ/π_rollout) | 2 (rollout, θ) | ✅ | RS mask only (ratio handles IS) | ✅ Correct | **Fast** |
+| `loss_type="reinforce"` | Off-policy REINFORCE | 2 (rollout, θ) | ❌ | ✅ (explicit IS weights) | ✅ Correct | **Fast** |
+| **Bypass Mode Presets (PPO-clip)** |
+| `bypass_ppo_clip` | PPO only | 2 (rollout, θ) | ✅ | - | ✅ Correct | **Fast** |
+| `bypass_ppo_clip_geo_rs` | PPO + Geo-RS | 2 (rollout, θ) | ✅ | Geo-RS mask (ratio) | ✅ Correct | **Fast** |
+| **Bypass Mode Presets (REINFORCE)** |
+| `bypass_pg_is` | REINFORCE + Seq-TIS | 2 (rollout, θ) | ❌ | ✅ Seq-TIS | ✅ Correct | **Fast** |
+| `bypass_pg_geo_rs` | REINFORCE + Geo-RS | 2 (rollout, θ) | ❌ | Geo-RS only (ratio) | ✅ Correct | **Fast** |
+| `bypass_pg_geo_rs_token_tis` | REINFORCE + Geo RS + Token IS | 2 (rollout, θ) | ❌ | ✅ Geo-RS-Token-TIS | ✅ Correct | **Fast** |
+| **Decoupled PPO Mode** (IS weights = π_old / π_rollout) |
+| `decoupled_token_is` | Decoupled PPO | 3 (rollout, old, θ) | ✅ | ✅ Token-TIS | ✅ Correct | Standard |
+| `decoupled_seq_is` | Decoupled PPO | 3 (rollout, old, θ) | ✅ | ✅ Seq-TIS | ✅ Correct | Standard |
+| `decoupled_seq_is_rs` | Decoupled PPO + RS | 3 (rollout, old, θ) | ✅ | ✅ Seq-MIS | ✅ Correct | Standard |
+| `decoupled_geo_rs` | Decoupled PPO + Geo-RS | 3 (rollout, old, θ) | ✅ | Geo-RS only (ratio) | ✅ Correct | Standard |
+| `decoupled_geo_rs_token_tis` | Decoupled PPO + Geo RS + Token IS | 3 (rollout, old, θ) | ✅ | ✅ Geo-RS-Token-TIS | ✅ Correct | Standard |
+| **Incorrect (for reference)** |
+| Naive LLM-RL | Incorrect PPO usage | 2 (old, θ) | ✅ | ❌ | ⚠️ Incorrect | Standard |
+
+**Notes:**
+- **Bypass mode** sets π_old = π_rollout and uses `loss_type` to select the loss function:
+ - `"ppo_clip"` (default): PPO clipped ratio (IS handled by ratio = π_θ/π_rollout, no explicit IS weights to avoid double-counting)
+ - `"reinforce"`: Explicit IS weights applied as $w \cdot \log \pi \cdot A$
+- Both loss types benefit from rejection sampling (RS) which masks out-of-distribution samples
+
+### 5.2 Estimator Hierarchy
+
+These estimators define **how IS weights and rejection masks are computed**. They are orthogonal to the operating mode (decoupled PPO vs bypass policy gradient) and can be combined with either.
+
+| Estimator | Configuration | Mechanism | Best For |
+|-----------|---------------|-----------|----------|
+| **Token-TIS** | `rollout_is="token"` | Clips per-token ratios | Lower variance IS with acceptable bias |
+| **Seq-TIS** | `rollout_is="sequence"` | Clips sequence ratio $\rho(\tau) \to \min(\rho(\tau), C)$ | Clean data with moderate mismatch; unbiased |
+| **Seq-MIS** | `rollout_is="sequence"` + `rollout_rs="seq_sum_k1"` | Rejects sequences with $\rho(\tau) > C$ | Severe mismatch; filters "toxic tail" (garbage data) |
+| **Geo-RS** | `rollout_rs="seq_mean_k1"` | Rejects on geometric mean ratio exp(E[log(r)]) | Length-invariant trust region |
+| **Geo-RS-Token-TIS** | `rollout_is="token"` + `rollout_rs="seq_mean_k1"` | Geometric filter + token IS weights | Ratio-based length normalization + lower variance IS |
+| **K3-RS** | `rollout_rs="seq_mean_k3"` | Rejects on K3 KL divergence | Small KL values; smooth detector |
+| **K3-RS-Token-TIS** | `rollout_is="token"` + `rollout_rs="seq_mean_k3"` | K3 filter + token IS weights | Small KL + lower variance IS |
+
+**Note:** Each estimator can be used with either:
+- **Decoupled PPO** (`bypass_mode=false`): Three policies with PPO clipping
+- **Bypass Mode** (`bypass_mode=true`): Two policies with configurable loss type
+ - `loss_type="ppo_clip"` (default): PPO clipped objective (IS via ratio, RS mask applied)
+ - `loss_type="reinforce"`: REINFORCE with explicit IS weights
+
+### 5.3 Method Characteristics by Scenario
+
+**Choosing estimator by off-policy severity:**
+- **Negligible** (same checkpoint, minor differences): No IS correction needed; use bypass mode for efficiency
+- **Moderate** (async workers, slight staleness): Token-TIS provides per-token IS correction with lower variance
+- **Severe** (replay buffers, old data): Seq-TIS or Seq-MIS provides sequence-level IS correction; use Seq-MIS when high-weight samples are likely garbage
+
+**Choosing estimator by sequence length:**
+- **Short sequences** (standard chat): Seq-TIS is optimal
+- **Long sequences** (CoT, agents): Geo-RS or Geo-RS-Token-TIS to avoid the Length Trap
+
+**Choosing operating mode:**
+- **Batch size invariance needed**: Use decoupled mode (`bypass_mode=false`)
+- **Computational efficiency needed**: Use bypass mode (`bypass_mode=true`) to skip `old_log_prob` computation
+- **No PPO clipping**: Use bypass mode with `loss_type="reinforce"`
+
+### 5.4 Decoupled Mode vs Bypass Mode
+
+**Decoupled mode** (computes `old_log_prob` separately):
+- Implements full decoupled PPO with three policies (mathematically correct)
+- Separately measures and corrects Drift 1 (rollout→old) and Drift 2 (old→current)
+- Achieves batch size invariance and efficient stale data utilization
+- Enables accurate off-policy metrics monitoring
+
+**Bypass mode** (sets $\pi_{\text{old}} = \pi_{\text{rollout}}$):
+- Uses $\pi_{\text{rollout}}$ as both behavior policy and proximal policy (mathematically correct)
+- Computational efficiency: Skips separate `old_log_prob` computation
+- Does not achieve batch size invariance (proximal policy depends on data collection)
+
+---
+
+## 6. Implementation References
+
+- **[Rollout Correction Usage Guide](rollout_corr.md)** - Practical configuration and troubleshooting
+- **Config:** [verl/trainer/config/algorithm.py](../../verl/trainer/config/algorithm.py)
+- **IS/RS Helper:** [verl/trainer/ppo/rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py)
+- **PPO Loss:** [verl/trainer/ppo/core_algos.py](../../verl/trainer/ppo/core_algos.py)
+- **Tests:** [tests/trainer/ppo/test_rollout_corr.py](../../tests/trainer/ppo/test_rollout_corr.py)
+
+---
+
+## References
+
+- **Williams, R. J. (1992).** "Simple statistical gradient-following algorithms for connectionist reinforcement learning." *Machine Learning*, 8(3-4), 229-256. https://doi.org/10.1007/BF00992696
+- **Schulman, J., Wolski, F., Dhariwal, P., Radford, A., & Klimov, O. (2017).** "Proximal policy optimization algorithms." *arXiv preprint arXiv:1707.06347.* https://arxiv.org/abs/1707.06347
+- **Hilton, J., Cobbe, K., & Schulman, J. (2021).** "Batch size-invariance for policy optimization." *arXiv preprint arXiv:2110.00641.* https://arxiv.org/abs/2110.00641
+ - Introduced decoupled PPO: separating proximal policy (for controlling policy update size) from behavior policy (for off-policy correction) to achieve batch size invariance
+- **Liu, J., Li, Y., et al. (2025).** "When Speed Kills Stability: Demystifying RL Collapse from the Training-Inference Mismatch"
+ - Blog post: https://richardli.xyz/rl-collapse (see Blog Series above for parts 1-3)
diff --git a/code/RL_model/verl/verl_train/docs/algo/spin.md b/code/RL_model/verl/verl_train/docs/algo/spin.md
new file mode 100644
index 0000000000000000000000000000000000000000..9349cef976f551a1f60376585f88da2313bdc3f7
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/algo/spin.md
@@ -0,0 +1,179 @@
+# Recipe: Self-Play Fine-Tuning (SPIN)
+
+Last updated: 05/31/2025.
+
+`verl` provides a recipe inspired by the paper **"Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models"** (SPIN). SPIN is a language model finetuning algorithm that enables iterative self-improvement through a self-play mechanism inspired by game theory.
+
+**Core Idea:** Models learn by playing against themselves, reducing reliance on external preference datasets or stronger teacher models:
+
+1. **Synthetic Data Generation:** The current model generates responses, creating its own training data from previous iterations.
+2. **Two-Player Game Setup:** A two-player game in which both players are played by a single LLM.
+3. **Iterative Training:** The model progressively improves by refining its policy, with each iteration's model becoming the opponent for the next iteration.
+
+Paper Authors: [Zixiang Chen](https://github.com/uclaml/SPIN)\*, [Yihe Deng](https://github.com/uclaml/SPIN)\*, [Huizhuo Yuan](https://scholar.google.com/citations?user=8foZzX4AAAAJ)\*, [Kaixuan Ji](https://scholar.google.com/citations?user=FOoKDukAAAAJ), [Quanquan Gu](https://web.cs.ucla.edu/~qgu/)
+
+[[Webpage](https://uclaml.github.io/SPIN/)] [[Huggingface](https://huggingface.co/papers/2401.01335)] [[Paper](https://arxiv.org/abs/2401.01335)] [[Original Implementation](https://github.com/uclaml/SPIN)]
+
+verl Implementation Authors: [Chendong Wang](https://cdwang96.github.io/), [Chenyang Zhao](https://github.com/zhaochenyang20)
+
+---
+
+## Key Function (compute_online_dpo_loss) and Related works
+SPIN (Chen et al., 2024) proposes an iterative self-play mechanism to fine-tune language models. In each iteration, SPIN's training objective, when using a logistic loss function, is equivalent to Direct Preference Optimization (DPO) loss (Rafailov et al., 2023).
+
+This `verl` recipe realizes SPIN's core concept by using DPO loss iteratively (Xu et al., 2023; Xiong et al., 2023; Snorkel AI, 2024). This means that in each iteration, we fine-tune the LLM using DPO loss for preference optimization. Notably, Xu et al. (2023) explored iterative preference optimization with pairwise cringe loss, while Xiong et al. (2023) discussed how to bridge theory and practice for RLHF under KL constraints using iterative training. The concept of iterative preference learning was also explored in online DPO (Guo et al., 2024), which focuses on direct alignment from online AI feedback. In online DPO, preference data is dynamically updated during training, allowing the model to learn from its own generated data.
+
+Specifically, we developed the **`compute_online_dpo_loss`** function and built this SPIN recipe on top of it. By incorporating online preference generation, this approach enables continuously refining language models without relying on fixed external preference datasets.
+
+**Reference Papers:**
+* [Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models](https://arxiv.org/abs/2401.01335) (Chen et al., 2024)
+* [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://arxiv.org/abs/2305.18290) (Rafailov et al., 2023)
+* [Some things are more cringe than others: Preference optimization with the pairwise cringe loss](https://arxiv.org/abs/2312.16682) (Xu et al., 2023)
+* [Iterative Preference Learning from Human Feedback: Bridging Theory and Practice for RLHF under KL-Constraint](https://arxiv.org/abs/2312.11456) (Xiong et al., 2023)
+* [Snorkel-Mistral-PairRM-DPO](https://huggingface.co/snorkelai/Snorkel-Mistral-PairRM-DPO) (Snorkel AI, 2024)
+* [Direct Language Model Alignment from Online AI Feedback](https://arxiv.org/abs/2402.04792) (Guo et al., 2024)
+
+
+## Our Online DPO Implementation
+
+Our `compute_online_dpo_loss` function adapts `verl`'s existing PPO infrastructure (based on `verl` v0.3.0.post1) for this iterative online DPO. Key aspects of our implementation include:
+
+* **No Critic:** Unlike PPO, we omit the value function critic.
+* **Dynamic Reference Model:** An explicit reference policy (`ref_policy_wg`) is used for DPO loss. This reference model's weights can be periodically updated from the actor (`ref_update_freq`), providing a dynamic baseline.
+* **Online Preference Generation:** The `compute_onlineDPO_pref` function (in `core_algos.py`) dynamically creates chosen/rejected pairs based on a reward source (e.g., rule-based ranking for math problems).
+* **DPO Loss Integration:** We replace PPO's policy loss with our `compute_online_dpo_loss` (in `core_algos.py`) within the actor update (`dp_actor.py`), directly optimizing the policy using the generated preferences.
+* **Iterative Training Orchestration:** The `SpinTrainer` (in `spin_trainer.py`) manages the entire self-play loop: generation, preference labeling, optional reference model updates, and policy updates, enabling continuous self-improvement aligned with SPIN's principles.
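+
+For orientation, the DPO objective at the heart of `compute_online_dpo_loss` can be sketched as follows (a minimal version over sequence-level log-probs; the recipe's actual signature may differ):
+
+```python
+import torch.nn.functional as F
+
+def dpo_loss(logp_chosen, logp_rejected, ref_logp_chosen, ref_logp_rejected, beta=0.1):
+    """Minimal DPO loss sketch; inputs are summed token log-probs per response."""
+    pi_logratio = logp_chosen - logp_rejected            # policy preference margin
+    ref_logratio = ref_logp_chosen - ref_logp_rejected   # reference-model margin
+    logits = beta * (pi_logratio - ref_logratio)
+    return -F.logsigmoid(logits).mean()                  # logistic preference loss
+```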
+
+---
+## Algorithm
+
+This recipe implements an online DPO algorithm adapted to the `verl` reinforcement learning framework, providing an alternative to PPO for fine-tuning language models.
+
+**Online Loop:** Instead of maximizing a scalar reward signal as in PPO, this approach directly optimizes the policy model to align with preference data generated *online* during training:
+
+1. **Generation:** The current model generates multiple responses for each prompt in a batch.
+2. **Preference Labeling:** A function evaluates the generated responses to determine which one is preferred (chosen) and which is dispreferred (rejected). This can be done with a reward function or with implicit, rule-based ranking (this recipe uses rule-based ranking on math problems).
+3. **Update:** This preference tuple (`prompt`, `chosen_response`, `rejected_response`) is used to update the actor model using `compute_online_dpo_loss`, comparing against a reference model.
+
+**Connection with SPIN:**
+Instead of relying on a fixed target data distribution, the online generation loop dynamically changes the target data distribution through the preference-labeling method in step 2 (here, rule-based ranking that selects the better response on math problems). This explores the direction discussed in Section 7 of the SPIN paper, "dynamically changing target data distribution", to potentially lift LLM performance beyond the ceiling of fixed human-annotated data.
+
+---
+
+## Reproduce the Experiment (Example Setup)
+
+The following steps outline how to set up the environment and run the SPIN recipe, based on the provided test log using GSM8K and Qwen2.5-3B-Instruct.
+
+1. **Setup Environment (Example using Docker):**
+ ```bash
+ # Start a container with GPU access and shared memory
+ docker run -it --name spin_test --gpus all \
+ --shm-size=32g \
+ --ipc=host \
+ -v /path/to/host/.cache:/root/.cache \
+ -e HF_TOKEN= \
+ lmsysorg/sglang:latest \
+ /bin/bash
+
+ # Inside the container or on your host machine:
+ # Ensure /tmp is writable
+ mkdir -p /tmp
+ chmod 1777 /tmp
+
+ # Install Python 3.10 (if not present) and venv
+ sudo apt update
+ sudo apt install -y python3.10 python3.10-venv tmux
+ python3 -m ensurepip --upgrade
+
+ # Create and activate a virtual environment
+ python3 -m venv ~/.python/spin_env
+ source ~/.python/spin_env/bin/activate
+
+ # Install uv (fast package installer)
+ python3 -m pip install uv
+ ```
+
+2. **Install verl and Dependencies:**
+ ```bash
+ # Clone the verl repository (the SPIN recipe lives under recipe/spin)
+ cd ~
+ git clone git@github.com:volcengine/verl.git && cd verl
+
+ # Install flash-attn (handle potential build issues)
+ python3 -m uv pip install wheel packaging
+ python3 -m uv pip install flash-attn --no-build-isolation --no-deps
+
+ # Install verl with sglang extras
+ python3 -m uv pip install -e ".[sglang]"
+ ```
+ *Note: If the `flash-attn` installation fails, retry the flash-attn commands above or consult its documentation.*
+
+3. **Login & Download Data/Model:**
+ ```bash
+ # Login to Weights & Biases (optional, for logging)
+ export WANDB_API_KEY=
+ # wandb login
+
+ # Download the GSM8K dataset
+ python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k # Adjusted path
+
+ # Download the base model (Example: Qwen2.5-3B-Instruct)
+ hf download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct
+ ```
+
+4. **Configure:**
+ * Modify the configuration file (e.g., `config/spin_trainer.yaml` or the one specified in the run script) with correct paths to your downloaded model, data, desired hyperparameters (`dpo_beta`, learning rate, etc.), and distributed training settings (nodes, GPUs per node).
+ * Pay attention to `actor_rollout_ref.model`, `data` paths, `reward_model` config (if using one), and `trainer.ref_update_freq`.
+
+5. **Run Training:**
+ ```bash
+ # Set CUDA visible devices (adjust based on your hardware and config)
+ export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+ # Launch the training script (e.g., test.sh or a custom script)
+ # Ensure test.sh points to the correct config and main script
+ bash recipe/spin/run_spin.sh
+ ```
+
+---
+
+## Configuration
+
+* The primary configuration is typically managed through a YAML file specified in the launch script (e.g., `config/spin_trainer.yaml`).
+* Key configuration sections:
+ * `data`: Paths to training/validation prompt files, batch sizes, sequence lengths.
+ * `actor_rollout_ref`: Paths to the base model (used for actor and initial reference), FSDP settings, optimization parameters (learning rate, scheduler).
+ * `reward_model`: Configuration for the reward model used for online preference labeling (path, batch size, etc.). Can be omitted if using a simpler reward function.
+ * `algorithm`: DPO-specific hyperparameters like `dpo_beta`, `dpo_loss_type`.
+ * `trainer`: Distributed training settings (nodes, GPUs per node), logging (WandB), checkpointing frequency, and `ref_update_freq` (set > 0 to enable periodic reference model updates from the actor).
+
+---
+
+## Key Files
+
+* `main_spin.py`: Main entry point using Hydra to load the config and launch the `SpinTrainer`.
+* `spin_trainer.py`: Defines the `SpinTrainer` class, orchestrating the Online DPO training loop.
+* `fsdp_workers.py`: Implements Ray workers (Actor, Reference) potentially using FSDP.
+* `dp_actor.py`: Contains the actor class, including the DPO policy update logic.
+* `core_algos.py`: Includes helper functions for `compute_online_dpo_loss` and `compute_onlineDPO_pref`.
+* `config/spin_trainer.yaml` (or similar): Main Hydra configuration file for the recipe.
+* `run_spin.sh` (or similar): Example bash script for launching a training run.
+* `README.md`: This file.
+
+---
+
+## Acknowledgement
+
+We sincerely thank the `verl` community and our advisors for their contributions and guidance, including (list adapted from SPPO):
+
+* [Zixiang Chen](https://sites.google.com/view/zxchen)
+* [Yuhao Yang](https://github.com/yhyang201)
+* [Yifan Zhang](https://github.com/yifanzhang-pro)
+* [Yongan Xiang](https://github.com/BearBiscuit05)
+* [Junrong Lin](https://github.com/ocss884)
+* [Yuxuan Tong](https://github.com/tongyx361)
+* [Guangming Shen](https://github.com/PeterSH6)
+* [Biao He](https://www.linkedin.com/in/biao-he/)
+* [Qingquan Song](https://qingquansong.github.io/)
+* [Chenyang Zhao](https://zhaochenyang20.github.io/Chayenne/)
+* [Quanquan Gu](https://web.cs.ucla.edu/~qgu/)
diff --git a/code/RL_model/verl/verl_train/docs/algo/sppo.md b/code/RL_model/verl/verl_train/docs/algo/sppo.md
new file mode 100644
index 0000000000000000000000000000000000000000..ec9679987a1f1dde7163cc69c0a93c83d3811db7
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/algo/sppo.md
@@ -0,0 +1,52 @@
+# Recipe: Self-Play Preference Optimization (SPPO)
+
+Last updated: 05/28/2025.
+
+verl provides a community recipe implementation for the paper [Self-Play Preference Optimization for Language Model Alignment](https://arxiv.org/abs/2405.00675). SPPO can significantly enhance the performance of an LLM without strong external signals such as responses or preferences from GPT-4. It can outperform models trained with iterative direct preference optimization (DPO), among other methods. SPPO is theoretically grounded, ensuring that the LLM can converge to the von Neumann winner (i.e., Nash equilibrium) under general, potentially intransitive preferences, and empirically validated through extensive evaluations on multiple datasets.
+
+Paper Authors: [Yue Wu](https://yuewu.us/)\*, [Zhiqing Sun](https://www.cs.cmu.edu/~zhiqings/)\*, [Huizhuo Yuan](https://scholar.google.com/citations?user=8foZzX4AAAAJ)\*, [Kaixuan Ji](https://scholar.google.com/citations?user=FOoKDukAAAAJ), [Yiming Yang](https://www.cs.cmu.edu/~yiming/), [Quanquan Gu](https://web.cs.ucla.edu/~qgu/)
+
+verl Implementation Authors: [Yuhao Yang](https://github.com/yhyang201), [Chenyang Zhao](https://github.com/zhaochenyang20)
+
+[[Webpage](https://uclaml.github.io/SPPO/)] [[Huggingface](https://huggingface.co/papers/2405.00675)] [[Paper](https://arxiv.org/abs/2405.00675)][[Original Implementation](https://github.com/uclaml/SPPO)]
+
+## Reproduce the Experiment
+
+We evaluate the performance of SPPO on the MATH dataset. Starting from an initial score of 46.6 with Qwen2.5-7B-Instruct, we achieve a score of 65.6 after 20 epochs of training, placing our model approximately in the top 20 on the [MATH leaderboard](https://paperswithcode.com/sota/math-word-problem-solving-on-math). It's important to note that verl's internal evaluation metrics may not perfectly align with the official evaluation methodology for Qwen2.5-7B-Instruct. Therefore, for consistency and fair comparison, we report only the results based on verl's evaluation framework.
+
+```
+git clone git@github.com:volcengine/verl.git
+cd verl
+python3 -m uv pip install -e ".[sglang]"
+
+export WANDB_API_KEY=
+
+python3 examples/data_preprocess/math_dataset.py --local_dir ~/data/math
+hf download Qwen/Qwen2.5-7B-Instruct --local-dir $HOME/models/Qwen2.5-7B-Instruct
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+bash recipe/sppo/run_qwen2.5-7b_rm.sh
+```
+
+Note that the installation may occasionally fail to build flash-attn. If this happens, you can install it manually by running:
+
+```bash
+python3 -m uv pip install wheel
+python3 -m uv pip install packaging
+python3 -m uv pip install flash-attn --no-build-isolation --no-deps
+```
+
+## Acknowledgement
+
+We sincerely thank the following people for their contributions and guidance:
+
+- [Yue Wu](https://yuewu.us/)
+- [Chendong Wang](https://cdwang96.github.io/)
+- [Yifan Zhang](https://github.com/yifanzhang-pro)
+- [Yongan Xiang](https://github.com/BearBiscuit05)
+- [Junrong Lin](https://github.com/ocss884)
+- [Yuxuan Tong](https://github.com/tongyx361)
+- [Guangming Shen](https://github.com/PeterSH6)
+- [Biao He](https://www.linkedin.com/in/biao-he/)
+- [Qingquan Song](https://qingquansong.github.io/)
+- [Quanquan Gu](https://web.cs.ucla.edu/~qgu/)
diff --git a/code/RL_model/verl/verl_train/docs/amd_tutorial/amd_build_dockerfile_page.rst b/code/RL_model/verl/verl_train/docs/amd_tutorial/amd_build_dockerfile_page.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fc462c17fbd8aab8aa57456b73bcf35e5aec5394
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/amd_tutorial/amd_build_dockerfile_page.rst
@@ -0,0 +1,796 @@
+Getting started with AMD (ROCM Kernel)
+=====================================================
+
+Last updated: 07/06/2025.
+
+Author: Yusheng Su
+
+Setup
+-----
+
+If you run on AMD GPUs (MI300) with the ROCm platform, you cannot use the previous quickstart to run verl. Follow the steps below to build a Docker image, and set ``RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES`` or ``RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES`` when starting Ray for verl's RLHF training.
+
+
+docker/Dockerfile.rocm
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ FROM "rlsys/rocm-6.3.4-patch:rocm6.3.4-numa-patch_ubuntu-22.04"
+
+ SHELL ["/bin/bash", "-ceuxo", "pipefail"]
+
+ ENV MAX_JOBS=512
+
+ ENV PATH="/usr/local/python3.12/bin:$PATH"
+ RUN ln -sf /usr/bin/python3.12 /usr/bin/python && \
+ ln -sf /usr/bin/pip3.12 /usr/bin/pip
+
+ ############################################
+ RUN apt-get update
+ RUN apt-get install -y pkg-config liblzma-dev
+ ############################################
+
+ ###########################################
+ ##########Install TransformerEngine########
+ ###########################################
+ WORKDIR /workspace/
+ # transformer-engine install
+ # https://github.com/ROCm/TransformerEngine
+ RUN rm -rf TransformerEngine
+ RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git
+ WORKDIR /workspace/TransformerEngine
+    RUN git checkout 236178e5
+ # git checkout bb061ade
+ # git checkout 864405c
+ ENV NVTE_FRAMEWORK=pytorch
+ ENV NVTE_ROCM_ARCH=gfx942
+ ENV NVTE_USE_HIPBLASLT=1
+ ENV NVTE_USE_ROCM=1
+ # export CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr:${CMAKE_PREFIX_PATH:-}"
+ ENV CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr"
+    RUN MAX_JOBS=${MAX_JOBS} pip install . -vvv
+ WORKDIR /workspace/
+ ###########################################
+ ###########################################
+ ###########################################
+
+
+
+
+
+ ####################################################################################
+    ################Install vllm - sglang requires vllm 0.6.7 dependency#################
+ ####################################################################################
+ #### Require vllm 0.6.7 - checkout 113274a0
+ WORKDIR /workspace/
+ RUN rm -rf vllm
+ RUN pip uninstall -y vllm
+ # Refer to here (down-grade vllm to 0.6.3): https://docs.vllm.ai/en/v0.6.3/getting_started/amd-installation.html
+ RUN git clone https://github.com/ROCm/vllm.git
+ # git clone https://github.com/vllm-project/vllm.git
+ WORKDIR /workspace/vllm
+ RUN git checkout 113274a0
+ ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+ #ENV MAX_JOBS=512
+ ENV MAX_JOBS=${MAX_JOBS}
+ RUN pip install "boto3>=1.26.0"
+ RUN pip install setuptools_scm
+ # will add src into py. You can delete the repo
+ RUN python3 setup.py install
+ WORKDIR /workspace/
+ ####################################################################################
+ ####################################################################################
+ ####################################################################################
+
+
+
+ ###########################################
+ ############For hack docker################
+ ###########################################
+ RUN pip install setuptools==75.8.0
+ ###########################################
+ ###########################################
+ ###########################################
+
+
+
+ ###########################################
+    ############build sglang###################
+ ###########################################
+ # Set environment variables
+ ENV BASE_DIR=/sgl-workspace
+ ENV BUILD_TYPE=all
+ ENV SGL_REPO=https://github.com/sgl-project/sglang
+ ENV SGL_BRANCH=v0.4.6.post5
+ ENV TRITON_REPO=https://github.com/ROCm/triton.git
+ ENV TRITON_COMMIT=improve_fa_decode_3.0.0
+ ENV AITER_REPO=https://github.com/ROCm/aiter.git
+ ENV AITER_COMMIT=v0.1.2
+ # v0.1.2 version - commit id: 9d11f47
+ # ENV AITER_COMMIT=9d11f47
+ ENV HIP_FORCE_DEV_KERNARG=1
+ ENV HSA_NO_SCRATCH_RECLAIM=1
+ ENV SGLANG_SET_CPU_AFFINITY=1
+ ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
+ ENV NCCL_MIN_NCHANNELS=112
+ ENV MOE_PADDING=1
+ ENV VLLM_FP8_PADDING=1
+ ENV VLLM_FP8_ACT_PADDING=1
+ ENV VLLM_FP8_WEIGHT_PADDING=1
+ ENV VLLM_FP8_REDUCE_CONV=1
+ ENV TORCHINDUCTOR_MAX_AUTOTUNE=1
+ ENV TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE=1
+ ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
+ ENV AMDGPU_TARGETS=gfx942
+ ENV ROCM_ARCH=gfx942
+ ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+ # Switch to working directory
+ WORKDIR /sgl-workspace
+ # Clean and create directory
+ RUN rm -rf /sgl-workspace && mkdir -p /sgl-workspace
+
+ # Clone and build sglang
+ RUN git clone ${SGL_REPO} \
+ && cd sglang \
+ && git checkout ${SGL_BRANCH} || echo "Using default branch" \
+ && cd sgl-kernel \
+ && rm -f pyproject.toml \
+ && mv pyproject_rocm.toml pyproject.toml \
+ && python setup_rocm.py install \
+ && cd .. \
+ && if [ "$BUILD_TYPE" = "srt" ]; then \
+ python -m pip --no-cache-dir install -e "python[srt_hip]"; \
+ else \
+ python -m pip --no-cache-dir install -e "python[all_hip]"; \
+ fi \
+ && cd /sgl-workspace \
+ && cp -r /sgl-workspace/sglang /sglang \
+ && python -m pip cache purge
+
+ # Install common Python packages
+ RUN pip install IPython orjson python-multipart torchao pybind11
+ # Rebuild Triton
+ RUN pip uninstall -y triton || true \
+ && git clone ${TRITON_REPO} \
+ && cd triton \
+ && git checkout ${TRITON_COMMIT} \
+ && cd python \
+ && python3 setup.py install \
+ && cd /sgl-workspace
+ # ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942 --amdgpu-lower-module-lds-strategy=1"
+ # ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
+
+ # Build aiter
+ #version: Commit 9d11f47
+ # && git checkout ${AITER_COMMIT} \
+ RUN pip uninstall -y aiter || true
+ RUN git clone ${AITER_REPO} \
+ && cd aiter \
+ && git checkout ${AITER_COMMIT} \
+ && git submodule sync \
+ && git submodule update --init --recursive \
+ && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py install \
+ && cd /sgl-workspace
+
+ # Copy MI300X config
+ RUN find /sgl-workspace/sglang/python/sglang/srt/layers/quantization/configs/ \
+ /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/ \
+ -type f -name '*MI300X*' | \
+ xargs -I {} sh -c 'vf_config=$(echo "$1" | sed "s/MI300X/MI300X_VF/"); cp "$1" "$vf_config"' -- {}
+
+ # Environment setup complete.
+ RUN echo "Environment setup complete."
+
+ WORKDIR /workspace/
+ ###########################################
+ ###########################################
+ ###########################################
+
+
+
+
+
+
+ ###########################################
+ ###############vllm v0.8.5#################
+ ###########################################
+ WORKDIR /workspace/
+
+ ENV VLLM_TARGET_DEVICE=rocm
+ ENV ROCM_PATH=/opt/rocm
+ ENV SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev
+ # Find the repo path in: DockerFile/Dockerfile.rocm_yang
+ # RUN git clone https://github.com/RLFoundation/vllm-patch.git
+ RUN pip uninstall -y vllm || true
+ RUN rm -rf vllm-patch
+ RUN git clone https://github.com/RLFoundation/vllm-patch.git \
+ && cd vllm-patch \
+ && git checkout v0.8.5-sleep-numa \
+ && rm -rf build/ dist/ *.egg-info \
+ && ln -sf /opt/rocm/lib/libamdhip64.so /usr/lib/libamdhip64.so \
+ && SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py install
+ # RUN SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py develop
+ WORKDIR /workspace/
+ ###########################################
+ ###########################################
+ ###########################################
+
+
+
+
+ #########################################
+ #### Install megatron-core###############
+ #########################################
+ RUN pip uninstall -y megatron-core && \
+ git clone https://github.com/yushengsu-thu/Megatron-LM-amd_version.git && \
+ cd Megatron-LM-amd_version && \
+ pip install -vvv -e . && \
+ cd /workspace/
+ #########################################
+ #########################################
+ #########################################
+
+
+
+
+ #######################################
+ ################apex###################
+ #######################################
+ WORKDIR /workspace/
+ RUN pip uninstall -y apex && \
+    git clone https://github.com/ROCm/apex.git && \
+ cd apex && \
+ python setup.py install && \
+ cd /workspace/
+ #######################################
+ #######################################
+ #######################################
+
+
+ ################################################################################
+ ###########################Add torch_memory_saver###############################
+ ################################################################################
+ # Set environment variables
+ ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
+ ENV CFLAGS="-D__HIP_PLATFORM_AMD__"
+ ENV CXXFLAGS="-D__HIP_PLATFORM_AMD__"
+ RUN pip install "git+https://github.com/YangWang92/torch_memory_saver_numa.git@numa"
+ ################################################################################
+ ################################################################################
+ ################################################################################
+
+
+
+ ########################################
+ ######Install ray#######################
+ ########################################
+ # need to add this patch: https://github.com/ray-project/ray/pull/53531/files
+ RUN pip uninstall ray -y
+ RUN pip install "ray[data,train,tune,serve]>=2.47.0"
+ ########################################
+ ########################################
+ ########################################
+
+
+ ##########################################
+ #######Install other dependencies#########
+ ##########################################
+ RUN pip install "tensordict==0.6.2" --no-deps && \
+ pip install accelerate \
+ codetiming \
+ datasets \
+ dill \
+ hydra-core \
+ liger-kernel \
+ numpy \
+ pandas \
+ peft \
+ "pyarrow>=15.0.0" \
+ pylatexenc \
+ torchdata \
+ wandb \
+ orjson \
+ pybind11
+
+ WORKDIR /workspace/
+ RUN git clone https://github.com/volcengine/verl.git && \
+ cd verl && \
+ pip install -e .
+ ##########################################
+ ##########################################
+ ##########################################
+
+ WORKDIR /workspace/
+ CMD ["/usr/bin/bash"]
+
+
+Build the image:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+    docker build -f docker/Dockerfile.rocm -t verl-rocm .
+
+Pull the image (alternative to building)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Note: Instead of building the image yourself, you can pull a pre-built one from this DockerHub account: `RLSys Foundation <https://hub.docker.com/u/yushengsuthu>`_.
+
+.. code-block:: bash
+
+ docker pull rlsys/verl:verl-0.4.1_ubuntu-22.04_rocm6.3.4-numa-patch_vllm0.8.5_sglang0.4.6.post4
+
+ docker tag rlsys/verl:verl-0.4.1_ubuntu-22.04_rocm6.3.4-numa-patch_vllm0.8.5_sglang0.4.6.post4 verl-rocm:latest
+
+Run the container
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+
+.. code-block:: bash
+
+ docker run --rm -it \
+ --device /dev/dri \
+ --device /dev/kfd \
+ -p 8265:8265 \
+ --group-add video \
+ --cap-add SYS_PTRACE \
+ --security-opt seccomp=unconfined \
+ --privileged \
+ -v $HOME/.ssh:/root/.ssh \
+ -v $HOME:$HOME \
+ --shm-size 128G \
+ -w $PWD \
+ verl-rocm \
+ /bin/bash
+
+(Optional) If you do not want to run as root and prefer to run as your own user,
+add ``-e HOST_UID=$(id -u)`` and ``-e HOST_GID=$(id -g)`` to the above docker launch command, as shown below.
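+
+A sketch of the same launch command with the two extra flags added:
+
+.. code-block:: bash
+
+    docker run --rm -it \
+      --device /dev/dri \
+      --device /dev/kfd \
+      -p 8265:8265 \
+      --group-add video \
+      --cap-add SYS_PTRACE \
+      --security-opt seccomp=unconfined \
+      --privileged \
+      -e HOST_UID=$(id -u) \
+      -e HOST_GID=$(id -g) \
+      -v $HOME/.ssh:/root/.ssh \
+      -v $HOME:$HOME \
+      --shm-size 128G \
+      -w $PWD \
+      verl-rocm \
+      /bin/bash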
+
+Example
+-------
+
+Due to special settings in AMD (ROCm) torch:
+
+1. If your ``ray>=2.45.0`` (default), you need to set ``RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES`` when starting ray in verl's RLHF training, and apply this `patch <https://github.com/ray-project/ray/pull/53531/files>`_.
+2. If your ``ray<2.45.0``, you need to set ``RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES`` when starting ray in verl's RLHF training.
+
+The inference engine ``$ENGINE`` can be ``vllm`` or ``sglang``; we use ``vllm`` as the default in the following examples.
+
+
+
+PPO
+~~~
+
+.. code-block:: bash
+
+ YOUR_PROJECT_NAME=r1-verl-ppo-upstream
+ YOUR_RUN_NAME=r1-training_ppo-upstream
+ # export HYDRA_FULL_ERROR=1
+
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+ # [ray] < 2.45.0
+ #export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
+
+ # [ray] >= 2.45.0
+ export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 # Patch with https://github.com/ray-project/ray/pull/52794
+
+ GPUS_PER_NODE=8
+ MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
+ python3 examples/data_preprocess/gsm8k.py --local_save_dir data/gsm8k
+ python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
+ ENGINE=vllm #sglang
+
+ PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
+ data.train_files=data/gsm8k/train.parquet \
+ data.val_files=data/gsm8k/test.parquet \
+ data.train_batch_size=256 \
+ data.val_batch_size=1312 \
+ data.max_prompt_length=512 \
+ data.max_response_length=256 \
+ actor_rollout_ref.model.path=$MODEL_PATH \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=64 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=$ENGINE \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+ critic.optim.lr=1e-5 \
+ critic.model.path=$MODEL_PATH \
+ critic.ppo_micro_batch_size_per_gpu=4 \
+ algorithm.kl_ctrl.kl_coef=0.001 \
+ trainer.logger=console \
+ trainer.project_name=$YOUR_PROJECT_NAME \
+ trainer.experiment_name=$YOUR_RUN_NAME \
+ trainer.val_before_train=False \
+ trainer.n_gpus_per_node=$GPUS_PER_NODE \
+ trainer.nnodes=1 \
+ trainer.save_freq=10 \
+ trainer.test_freq=10 \
+ trainer.total_epochs=15 #2>&1 | tee verl_demo.log
+
+GRPO
+~~~~
+
+.. code-block:: bash
+
+ YOUR_PROJECT_NAME=r1-verl-grpo-upstream
+ YOUR_RUN_NAME=r1-training_grpo-upstream
+ # export HYDRA_FULL_ERROR=1
+ # export FSDP_VERBOSE=1
+
+ #export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+ # [ray] < 2.45.0
+ #export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
+
+ # [ray] >= 2.45.0
+ export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 # Patch with https://github.com/ray-project/ray/pull/52794
+
+ GPUS_PER_NODE=8
+ MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
+ # MODEL_PATH=Qwen/Qwen2-7B-Instruct
+ python3 examples/data_preprocess/gsm8k.py --local_save_dir data/gsm8k
+ python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
+ ENGINE=vllm #sglang
+
+ python3 -m verl.trainer.main_ppo \
+ algorithm.adv_estimator=grpo \
+ data.train_files=data/gsm8k/train.parquet \
+ data.val_files=data/gsm8k/test.parquet \
+ data.train_batch_size=1024 \
+ data.val_batch_size=1312 \
+ data.max_prompt_length=512 \
+ data.max_response_length=1024 \
+ actor_rollout_ref.model.path=$MODEL_PATH \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.use_dynamic_bsz=True \
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
+ actor_rollout_ref.actor.use_kl_loss=True \
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+        actor_rollout_ref.model.enable_gradient_checkpointing=False \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+ actor_rollout_ref.rollout.name=$ENGINE \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
+ actor_rollout_ref.rollout.n=5 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=False \
+ algorithm.kl_ctrl.kl_coef=0.001 \
+ trainer.critic_warmup=0 \
+ trainer.logger=console \
+ trainer.project_name=$YOUR_PROJECT_NAME \
+ trainer.experiment_name=$YOUR_RUN_NAME \
+ trainer.n_gpus_per_node=$GPUS_PER_NODE \
+ trainer.val_before_train=False \
+ trainer.nnodes=1 \
+ trainer.save_freq=-1 \
+ trainer.test_freq=10 \
+ trainer.total_epochs=15
+
+
+
+Multi-node training: slurm with Docker/Podman container
+---------------------------------------------------------------------------------------
+
+If you want to run multi-node training with slurm, you can use the following script.
+
+.. note::
+ 1. You need to use ``podman`` or ``docker`` in the following script. We will release the apptainer script later.
+   2. If you want to use ``podman``, simply replace ``docker`` with ``podman`` in the script.
+
+The script includes the following steps:
+
+1. SLURM Configuration
+2. Environment Setup
+3. Docker/Podman Container Setup
+4. Ray Cluster Initialization
+5. Data Preprocessing
+6. Model Setup
+7. Training Launch
+
+
+slurm_script.sh
+~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ #!/bin/bash
+
+ #SBATCH --job-name=verl-ray-on-slurm
+ #SBATCH --nodes=2
+ #SBATCH --ntasks-per-node=2
+ #SBATCH --mem=200G
+ #SBATCH --time=30-00:00:00
+ #SBATCH --gpus-per-node=8
+ #SBATCH --cpus-per-task=28
+ #SBATCH --output=../verl_log/slurm-%j.out
+ #SBATCH --error=../verl_log/slurm-%j.err
+ #SBATCH --nodelist=gpu-[0,1]
+
+
+ # load necessary modules
+ ### Run this setup
+ # [Cluster]: Use docker
+ # docker pull docker.io/rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
+
+
+ ##########################################################################
+ ###The following setting should be set in different project and cluster###
+ ##########################################################################
+
+ ### Project
+ CONTAINER_NAME="multinode_verl_training"
+ IMG="verl.rocm"
+ DOCKERFILE="docker/Dockerfile.rocm"
+ # echo $PWD
+ verl_workdir="${HOME}/projects/verl_upstream"
+ export TRANSFORMERS_CACHE="${HOME}/.cache/huggingface"
+ export HF_HOME=$TRANSFORMERS_CACHE
+
+ ### Cluster Network Setting
+ export NCCL_DEBUG=TRACE
+ export GPU_MAX_HW_QUEUES=2
+ export TORCH_NCCL_HIGH_PRIORITY=1
+ export NCCL_CHECKS_DISABLE=1
+ # export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+ export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
+ export NCCL_IB_GID_INDEX=3
+ export NCCL_CROSS_NIC=0
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+ export NCCL_PROTO=Simple
+ export RCCL_MSCCL_ENABLE=0
+ export TOKENIZERS_PARALLELISM=false
+ export HSA_NO_SCRATCH_RECLAIM=1
+ ##########################################################################
+
+ ## Assign using GPUs
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+ ### For rocm and training script
+ # [ray] < 2.45.0
+ #export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
+
+ # [ray] >= 2.45.0
+ export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 # Patch with https://github.com/ray-project/ray/pull/52794
+
+
+ # Build and launch the Docker container
+ srun bash -c "
+ # Exit on any error
+ set -e
+
+    # Clean up dangling images (images with no tag)
+ docker image prune -f
+
+ # Need to pull the docker first
+ docker pull rlsys/verl:verl-0.4.1_ubuntu-22.04_rocm6.3.4-numa-patch_vllm0.8.5_sglang0.4.6.post4
+
+ if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "${IMG}"; then
+ echo \"Building ${IMG} image...\"
+ docker build -f \"${DOCKERFILE}\" -t \"${IMG}\" .
+ else
+ echo \"${IMG} image already exists, skipping build\"
+ fi
+
+ # Removing old container if exists
+ docker rm \"${CONTAINER_NAME}\" 2>/dev/null || true
+
+ # Checking network devices
+ ibdev2netdev
+
+ # Launch the docker
+ docker run --rm -d \
+ -e HYDRA_FULL_ERROR=1 \
+ -e RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 \
+ -e RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 \
+ -e NCCL_DEBUG=${NCCL_DEBUG} \
+ -e GPU_MAX_HW_QUEUES=${GPU_MAX_HW_QUEUES} \
+ -e TORCH_NCCL_HIGH_PRIORITY=${TORCH_NCCL_HIGH_PRIORITY} \
+ -e NCCL_CHECKS_DISABLE=${NCCL_CHECKS_DISABLE} \
+ -e NCCL_IB_HCA=${NCCL_IB_HCA} \
+ -e NCCL_IB_GID_INDEX=${NCCL_IB_GID_INDEX} \
+ -e NCCL_CROSS_NIC=${NCCL_CROSS_NIC} \
+ -e CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS} \
+ -e NCCL_PROTO=${NCCL_PROTO} \
+ -e RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE} \
+ -e TOKENIZERS_PARALLELISM=${TOKENIZERS_PARALLELISM} \
+ -e HSA_NO_SCRATCH_RECLAIM=${HSA_NO_SCRATCH_RECLAIM} \
+ -e TRANSFORMERS_CACHE=${TRANSFORMERS_CACHE} \
+ -e HF_HOME=${HF_HOME} \
+ --network host \
+ --device /dev/dri \
+ --device /dev/kfd \
+ --device /dev/infiniband \
+ --group-add video \
+ --cap-add SYS_PTRACE \
+ --security-opt seccomp=unconfined \
+ --privileged \
+ -v \${HOME}:\${HOME} \
+ -v \${HOME}/.ssh:/root/.ssh \
+ -w "${verl_workdir}" \
+ --shm-size 128G \
+ --name \"${CONTAINER_NAME}\" \
+ \"${IMG}\" \
+ tail -f /dev/null
+
+ echo \"Container setup completed\"
+ "
+    # (Optional): If you do not want root mode and want to run as your own user,
+ # Please add `-e HOST_UID=$(id -u)` and `-e HOST_GID=$(id -g)` into the above docker launch script.
+
+
+
+
+
+ ### Ray launch the nodes before training
+
+ # Getting the node names
+ nodes_array=($(scontrol show hostnames "$SLURM_JOB_NODELIST" | tr '\n' ' '))
+
+ head_node=${nodes_array[0]}
+ head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
+
+ # if we detect a space character in the head node IP, we'll
+ # convert it to an ipv4 address. This step is optional.
+ if [[ "$head_node_ip" == *" "* ]]; then
+ IFS=' ' read -ra ADDR <<<"$head_node_ip"
+ if [[ ${#ADDR[0]} -gt 16 ]]; then
+ head_node_ip=${ADDR[1]}
+ else
+ head_node_ip=${ADDR[0]}
+ fi
+ echo "IPV6 address detected. We split the IPV4 address as $head_node_ip"
+ fi
+
+ port=6379
+ ip_head=$head_node_ip:$port
+ export ip_head
+ echo "IP Head: $ip_head"
+
+ # make sure we set environment variables before Ray initialization
+
+ # Print out all env variables
+ printenv
+
+ echo "Starting HEAD at $head_node"
+ srun --nodes=1 --ntasks=1 -w "$head_node" \
+ docker exec "${CONTAINER_NAME}" \
+ ray start --head --node-ip-address="$head_node_ip" --port=$port \
+ --dashboard-port=8266 \
+ --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block &
+ # optional, though may be useful in certain versions of Ray < 1.0.
+ sleep 10
+
+ # number of nodes other than the head node
+ worker_num=$((SLURM_JOB_NUM_NODES - 1))
+
+ for ((i = 1; i <= worker_num; i++)); do
+ node_i=${nodes_array[$i]}
+ echo "Debug: Starting worker on node_i = ${node_i}"
+ if [ -z "$node_i" ]; then
+ echo "Error: Empty node name for worker $i"
+ continue
+ fi
+ echo "Starting WORKER $i at $node_i"
+ srun --nodes=1 --ntasks=1 -w "$node_i" \
+ docker exec "${CONTAINER_NAME}" \
+ ray start --address "$ip_head" --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block &
+ sleep 5
+ done
+
+
+
+
+    # Ray initialization test (check whether the above execution produced any errors)
+ echo "Testing Ray initialization in the slurm nodes..."
+ docker exec "${CONTAINER_NAME}" python3 -c '
+ import ray
+ try:
+ ray.init(address="auto")
+ print("\n=== Ray Cluster Status ===")
+ print(f"Number of nodes: {len(ray.nodes())}")
+ for node in ray.nodes():
+ print("Node: {}, Status: {}".format(node["NodeManagerHostname"], node["Alive"]))
+ # print(f"Node: {node}")
+ ray.shutdown()
+ print("Ray initialization successful!")
+ except Exception as e:
+ print(f"Ray initialization failed: {str(e)}")
+ '
+ echo "=== Ray test completed ==="
+ ######
+
+
+
+ # Run data preprocessing
+
+ echo "Starting data preprocessing..."
+ docker exec "${CONTAINER_NAME}" \
+ python3 "examples/data_preprocess/gsm8k.py" "--local_save_dir" "../data/gsm8k"
+
+ echo "Starting data preprocessing..."
+ docker exec "${CONTAINER_NAME}" \
+ python3 "examples/data_preprocess/math_dataset.py" "--local_dir" "../data/math"
+
+ train_files="../data/gsm8k/train.parquet"
+ val_files="../data/gsm8k/test.parquet"
+
+    # Download and cache the model with a quick pipeline test
+    echo "Loading model..."
+    docker exec "${CONTAINER_NAME}" \
+    python3 -c "import transformers; transformers.pipeline('text-generation', model='Qwen/Qwen2-7B-Instruct')"
+    MODEL_PATH="Qwen/Qwen2-7B-Instruct"
+
+    echo "== Data and model loading Done =="
+
+    echo "Start to train..."
+
+
+ PYTHONUNBUFFERED=1 srun --overlap --nodes=${SLURM_NNODES} --ntasks=1 -w "$head_node" \
+ docker exec "${CONTAINER_NAME}" \
+ python3 -m verl.trainer.main_ppo \
+ data.train_files=$train_files \
+ data.val_files=$val_files \
+ data.train_batch_size=1024 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=1024 \
+ actor_rollout_ref.model.path=$MODEL_PATH \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ critic.optim.lr=1e-5 \
+ critic.model.use_remove_padding=True \
+ critic.model.path=$MODEL_PATH \
+ critic.model.enable_gradient_checkpointing=False \
+ critic.ppo_micro_batch_size_per_gpu=8 \
+ critic.model.fsdp_config.param_offload=False \
+ critic.model.fsdp_config.optimizer_offload=False \
+ algorithm.kl_ctrl.kl_coef=0.0001 \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='verl_example' \
+ trainer.experiment_name='Qwen2.5-32B-Instruct_function_rm' \
+ trainer.n_gpus_per_node=${SLURM_GPUS_PER_NODE} \
+ trainer.val_before_train=False \
+ trainer.nnodes=${SLURM_NNODES} \
+ trainer.save_freq=-1 \
+ trainer.test_freq=10 \
+ trainer.total_epochs=15
+
+
+Run slurm_script.sh
+~~~~~~~~~~~~~~~~~~~~
+
+Submit the script with ``sbatch``:
+
+.. code-block:: bash
+
+ sbatch slurm_script.sh
+
diff --git a/code/RL_model/verl/verl_train/docs/amd_tutorial/amd_vllm_page.rst b/code/RL_model/verl/verl_train/docs/amd_tutorial/amd_vllm_page.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7c230acab8792406e0ecb82d1a4fb417ba027a2e
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/amd_tutorial/amd_vllm_page.rst
@@ -0,0 +1,41 @@
+verl performance tuning for AMD (ROCm Kernel)
+=====================================================
+
+Last updated: 11/13/2025.
+
+Author: `Yang Wang `_, `Songlin Jiang `_
+
+Use vLLM Sleep Mode for AMD MI3xx series GPUs
+--------------------------------------------------------------
+
+By default, verl requires vLLM to enable sleep mode, which allows vLLM to offload GPU memory to CPU memory after rollout. This feature has been merged into vLLM's main branch and is available in versions later than 0.11.0.
+
+For now, you can use the vLLM main branch and build it from source, or install vLLM directly from the pre-built ROCm wheels for versions later than 0.11.0 once they are available.
+
+1. Clone the vLLM repository and build it with the following commands:
+
+.. code-block:: bash
+
+ git clone https://github.com/vllm-project/vllm.git
+ cd vllm
+ git reset --hard 4ca5cd5740c0cd7788cdfa8b7ec6a27335607a48 # You can also use a later commit as you wish
+ python -m pip install -r requirements/rocm.txt
+ VLLM_TARGET_DEVICE=rocm ROCM_PATH=/opt/rocm/ python3 setup.py develop
+
+2. Additionally, we recommend using ROCm 7.0 or later.
+
+After the upgrade, you can verify whether sleep mode is working by trying out `these scripts `_.
+
+If sleep mode is working, you should see GPU memory usage drop after the engine is put to sleep.
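+
+A minimal offline check along those lines (a sketch, not verl's own test script; the model name is an arbitrary example, and ``enable_sleep_mode``/``sleep``/``wake_up`` are the upstream vLLM sleep-mode interface):
+
+.. code-block:: python
+
+    import torch
+    from vllm import LLM
+
+    # Build an engine with sleep mode enabled.
+    llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", enable_sleep_mode=True)
+
+    free_before, _ = torch.cuda.mem_get_info()
+    llm.sleep(level=1)   # offload weights to CPU and discard the KV cache
+    free_after, _ = torch.cuda.mem_get_info()
+    print(f"freed {(free_after - free_before) / 2**30:.1f} GiB")
+
+    llm.wake_up()        # restore weights before the next rollout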
+
+After applying the vLLM patch and completing the installation, you can enable sleep mode in verl to reduce memory overhead. This allows verl to offload unused GPU memory during rollout, significantly lowering the memory footprint during long-context training or multi-node reinforcement learning.
+
+
+Enable CUDA Graph and Bypass ROCm-related issues
+--------------------------------------------------------------
+
+Due to potential issues with CUDA graph capture in ROCm, we've found that vLLM's CUDA graph feature cannot be enabled on multiple nodes in verl on AMD platforms with vLLM V1 mode. This leads to significantly slower rollout performance.
+
+Our investigation shows that ROCm may trigger an unexpected crash when attempting to capture large batches with CUDA graph. One workaround is to set ``actor_rollout_ref.rollout.cudagraph_capture_sizes`` to values such as ``[1, 2, 4, 8, 16, 32, 64]`` (change depending on your GPU memory size).
+
+Then, you can choose to enable CUDA graph by setting ``actor_rollout_ref.rollout.enforce_eager`` to ``False`` in your verl configuration file.
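+
+Put together, the relevant rollout settings might look like this in the configuration (the capture sizes are the example values above; tune them for your GPU memory):
+
+.. code-block:: yaml
+
+    actor_rollout_ref:
+      rollout:
+        enforce_eager: False
+        cudagraph_capture_sizes: [1, 2, 4, 8, 16, 32, 64]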
diff --git a/code/RL_model/verl/verl_train/docs/api/data.rst b/code/RL_model/verl/verl_train/docs/api/data.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5baa5b51bfdb79f6ead72f1f46141720248bd813
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/api/data.rst
@@ -0,0 +1,61 @@
+Data interface
+=========================
+
+Last updated: 05/19/2025 (API docstrings are auto-generated).
+
+DataProto is the interface for data exchange.
+
+The :class:`verl.DataProto` class contains two key members:
+
+- batch: a :class:`tensordict.TensorDict` object for the actual data
+- meta_info: a :class:`Dict` with additional meta information
+
+TensorDict
+~~~~~~~~~~~~
+
+:attr:`DataProto.batch` is built on top of :class:`tensordict`, a project in the PyTorch ecosystem.
+A TensorDict is a dict-like container for tensors. To instantiate a TensorDict, you must specify key-value pairs as well as the batch size.
+
+.. code-block:: python
+
+ >>> import torch
+ >>> from tensordict import TensorDict
+ >>> tensordict = TensorDict({"zeros": torch.zeros(2, 3, 4), "ones": torch.ones(2, 3, 5)}, batch_size=[2,])
+ >>> tensordict["twos"] = 2 * torch.ones(2, 5, 6)
+ >>> zeros = tensordict["zeros"]
+ >>> tensordict
+ TensorDict(
+ fields={
+ ones: Tensor(shape=torch.Size([2, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
+ twos: Tensor(shape=torch.Size([2, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False),
+ zeros: Tensor(shape=torch.Size([2, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
+ batch_size=torch.Size([2]),
+ device=None,
+ is_shared=False)
+
+One can also index a tensordict along its batch_size. The contents of the TensorDict can be manipulated collectively as well.
+
+.. code-block:: python
+
+ >>> tensordict[..., :1]
+ TensorDict(
+ fields={
+ ones: Tensor(shape=torch.Size([1, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
+ twos: Tensor(shape=torch.Size([1, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False),
+ zeros: Tensor(shape=torch.Size([1, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
+ batch_size=torch.Size([1]),
+ device=None,
+ is_shared=False)
+ >>> tensordict = tensordict.to("cuda:0")
+ >>> tensordict = tensordict.reshape(6)
+
+For more about :class:`tensordict.TensorDict` usage, see the official tensordict_ documentation.
+
+.. _tensordict: https://pytorch.org/tensordict/stable/overview.html
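+
+A minimal usage sketch of wrapping such a TensorDict in a :class:`verl.DataProto` (the key names here are illustrative only):
+
+.. code-block:: python
+
+    import torch
+    from tensordict import TensorDict
+    from verl import DataProto
+
+    # Wrap a TensorDict of batch tensors plus free-form meta information.
+    batch = TensorDict(
+        {"input_ids": torch.zeros(4, 8, dtype=torch.long)}, batch_size=[4]
+    )
+    data = DataProto(batch=batch, meta_info={"temperature": 1.0})
+
+    # Select a subset of tensor keys; the result is again a DataProto.
+    subset = data.select(batch_keys=["input_ids"])
+    print(subset.batch["input_ids"].shape)  # torch.Size([4, 8])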
+
+
+Core APIs
+~~~~~~~~~~~~~~~~~
+
+.. autoclass:: verl.DataProto
+ :members: to, select, union, make_iterator, concat
diff --git a/code/RL_model/verl/verl_train/docs/api/single_controller.rst b/code/RL_model/verl/verl_train/docs/api/single_controller.rst
new file mode 100644
index 0000000000000000000000000000000000000000..44ea366ffe4b12ce5293821877ce70a0073f2152
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/api/single_controller.rst
@@ -0,0 +1,30 @@
+Single Controller interface
+============================
+
+Last updated: 05/27/2025 (API docstrings are auto-generated).
+
+The Single Controller provides a unified interface for managing distributed workers
+using Ray or other backends and executing functions across them.
+It simplifies the process of dispatching tasks and collecting results, particularly
+when dealing with data parallelism or model parallelism.
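+
+A minimal sketch of spawning a worker group (argument names follow verl's tests and may differ across versions; Ray must be installed):
+
+.. code-block:: python
+
+    import ray
+    from verl.single_controller import Worker
+    from verl.single_controller.ray import (
+        RayClassWithInitArgs,
+        RayResourcePool,
+        RayWorkerGroup,
+    )
+
+    ray.init()
+
+    # Two worker processes on a single node; no GPUs needed for this demo.
+    pool = RayResourcePool(process_on_nodes=[2], use_gpu=False)
+    cls_with_init = RayClassWithInitArgs(cls=ray.remote(Worker))
+    wg = RayWorkerGroup(resource_pool=pool, ray_cls_with_init=cls_with_init)
+
+    print(wg.world_size)  # -> 2
+    ray.shutdown()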
+
+
+Core APIs
+~~~~~~~~~~~~~~~~~
+
+.. autoclass:: verl.single_controller.Worker
+ :members: __init__, __new__, get_master_addr_port, get_cuda_visible_devices, world_size, rank
+
+.. autoclass:: verl.single_controller.WorkerGroup
+ :members: __init__, world_size
+
+.. autoclass:: verl.single_controller.ClassWithInitArgs
+ :members: __init__, __call__
+
+.. autoclass:: verl.single_controller.ResourcePool
+ :members: __init__, world_size, local_world_size_list, local_rank_list
+
+.. autoclass:: verl.single_controller.ray.RayWorkerGroup
+ :members: __init__
+
+.. autofunction:: verl.single_controller.ray.create_colocated_worker_cls
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/docs/api/trainer.rst b/code/RL_model/verl/verl_train/docs/api/trainer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..abfa51f01a31606f436a95fde13770577b9ab540
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/api/trainer.rst
@@ -0,0 +1,31 @@
+Trainer Interface
+================================
+
+Last updated: 06/08/2025 (API docstrings are auto-generated).
+
+Trainers drive the training loop. Introducing new trainer classes for new training paradigms is encouraged.
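+
+A schematic of how a trainer is driven (a sketch only: the real constructor takes additional arguments such as role worker classes and resource pool specs, and ``config`` is the composed Hydra config):
+
+.. code-block:: python
+
+    from verl.trainer.ppo.ray_trainer import RayPPOTrainer
+
+    # trainer = RayPPOTrainer(config=config, tokenizer=tokenizer, ...)
+    # trainer.init_workers()  # spawn actor/critic/ref/rollout worker groups
+    # trainer.fit()           # run the PPO training loop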
+
+.. autosummary::
+ :nosignatures:
+
+ verl.trainer.ppo.ray_trainer.RayPPOTrainer
+
+
+Core APIs
+~~~~~~~~~~~~~~~~~
+
+.. autoclass:: verl.trainer.ppo.ray_trainer.RayPPOTrainer
+ :members: __init__, init_workers, fit
+
+.. automodule:: verl.utils.tokenizer
+ :members: hf_tokenizer
+
+.. automodule:: verl.trainer.ppo.core_algos
+    :members: agg_loss, kl_penalty, compute_policy_loss
+
+.. automodule:: verl.trainer.ppo.reward
+ :members: load_reward_manager, compute_reward, compute_reward_async
+
+.. autoclass:: verl.workers.reward_manager.NaiveRewardManager
+
+.. autoclass:: verl.workers.reward_manager.DAPORewardManager
diff --git a/code/RL_model/verl/verl_train/docs/api/utils.rst b/code/RL_model/verl/verl_train/docs/api/utils.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e15e3a5a32bdbb129a25d93b12e751385caa30b5
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/api/utils.rst
@@ -0,0 +1,76 @@
+Utilities
+============
+
+Last updated: 05/19/2025 (API docstrings are auto-generated).
+
+This section documents the utility functions and classes in the VERL library.
+
+Python Functional Utilities
+------------------------------
+
+.. automodule:: verl.utils.py_functional
+ :members: append_to_dict
+
+File System Utilities
+------------------------
+
+.. automodule:: verl.utils.fs
+ :members: copy_to_local
+
+Tracking Utilities
+---------------------
+
+.. automodule:: verl.utils.tracking
+ :members: Tracking
+
+Metrics Utilities
+---------------------
+
+.. automodule:: verl.utils.metric
+ :members: reduce_metrics
+
+Checkpoint Management
+------------------------
+
+.. automodule:: verl.utils.checkpoint.checkpoint_manager
+ :members: find_latest_ckpt_path
+
+.. automodule:: verl.utils.checkpoint.fsdp_checkpoint_manager
+ :members: FSDPCheckpointManager
+
+Dataset Utilities
+---------------------
+
+.. automodule:: verl.utils.dataset.rl_dataset
+ :members: RLHFDataset, collate_fn
+
+Torch Functional Utilities
+-----------------------------
+
+.. automodule:: verl.utils.torch_functional
+ :members: get_constant_schedule_with_warmup, masked_whiten, masked_mean, logprobs_from_logits
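+
+For instance, ``masked_mean`` averages only the unmasked positions (a small illustration; the values are arbitrary):
+
+.. code-block:: python
+
+    import torch
+    from verl.utils.torch_functional import masked_mean
+
+    values = torch.tensor([1.0, 2.0, 3.0, 4.0])
+    mask = torch.tensor([1.0, 1.0, 0.0, 0.0])  # only the first two positions count
+    print(masked_mean(values, mask))  # tensor(1.5000)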
+
+Sequence Length Balancing
+----------------------------
+
+.. automodule:: verl.utils.seqlen_balancing
+ :members: get_reverse_idx, rearrange_micro_batches
+
+Ulysses Utilities
+--------------------
+
+.. automodule:: verl.utils.ulysses
+ :members: gather_outputs_and_unpad, ulysses_pad_and_slice_inputs
+
+FSDP Utilities
+------------------
+
+.. automodule:: verl.utils.fsdp_utils
+    :members: get_fsdp_wrap_policy, get_init_weight_context_manager, init_fn, load_fsdp_model_to_gpu, load_fsdp_optimizer, offload_fsdp_model_to_cpu, offload_fsdp_optimizer
+
+Debug Utilities
+-------------------
+
+.. automodule:: verl.utils.profiler
+ :members: log_gpu_memory_usage, GPUMemoryLogger
+
diff --git a/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_consistency.rst b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_consistency.rst
new file mode 100644
index 0000000000000000000000000000000000000000..20aab3c7057fb70e6b2326f72dce4aeee4002703
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_consistency.rst
@@ -0,0 +1,50 @@
+Align the inference results of the verl and vLLM frameworks on Ascend devices(zh)
+==================================================================================
+
+Last updated: 11/17/2025.
+
+This is a tutorial for aligning the inference results of the verl and vLLM frameworks on Ascend devices.
+
+Environment variable configuration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+With multi-device communication:
+
+- Under HCCL communication (the default scenario):
+
+ - export CLOSE_MATMUL_K_SHIFT=1
+ - export ATB_MATMUL_SHUFFLE_K_ENABLE=0
+ - export HCCL_DETERMINISTIC="true"
+ - export VLLM_ENABLE_V1_MULTIPROCESSING=0
+
+- Under LCCL communication (enabled via ``export HCCL_OP_EXPANSION_MODE="AIV"``):
+
+ - export CLOSE_MATMUL_K_SHIFT=1
+ - export ATB_MATMUL_SHUFFLE_K_ENABLE=0
+ - export LCCL_DETERMINISTIC=1
+ - export ATB_LLM_LCOC_ENABLE=0
+ - export VLLM_ENABLE_V1_MULTIPROCESSING=0
+
+On a single device without communication:
+
+- Under both HCCL and LCCL communication:
+
+ - export CLOSE_MATMUL_K_SHIFT=1
+ - export ATB_MATMUL_SHUFFLE_K_ENABLE=0
+ - export VLLM_ENABLE_V1_MULTIPROCESSING=0
+
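+For convenience, the exports for the default multi-device HCCL scenario can be applied as one shell snippet (a consolidation of the variables listed above):
+
+.. code:: bash
+
+   export CLOSE_MATMUL_K_SHIFT=1
+   export ATB_MATMUL_SHUFFLE_K_ENABLE=0
+   export HCCL_DETERMINISTIC="true"
+   export VLLM_ENABLE_V1_MULTIPROCESSING=0
+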
+vLLM initialization parameters
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You need to explicitly set a seed in ``SamplingParams`` to keep the vLLM and verl inference results consistent. An example modification:
+
+.. code:: python
+
+    sampling_params = SamplingParams(n=1,
+                                     logprobs=0,  # can be set to 0 and let the actor recompute
+                                     max_tokens=config.response_length,
+                                     repetition_penalty=config.get("repetition_penalty", 1.0),
+                                     seed=1234)
+
diff --git a/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_profiling_en.rst b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_profiling_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..aa9c9adc8fc001dc34c1e510abe993edaa7fe7fb
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_profiling_en.rst
@@ -0,0 +1,403 @@
+Performance data collection based on FSDP or MindSpeed(Megatron) on Ascend devices(en)
+==========================================================================================
+
+Last updated: 12/20/2025.
+
+This is a tutorial for data collection using the GRPO or DAPO algorithm
+based on FSDP or MindSpeed(Megatron) on Ascend devices.
+
+Configuration
+-------------
+
+Leverage two levels of configuration to control data collection:
+
+- **Global profiler control**: Use parameters in ``verl/trainer/config/ppo_trainer.yaml`` (FSDP) or ``verl/trainer/config/ppo_megatron_trainer.yaml`` (MindSpeed) to control the collection mode and steps.
+- **Role profile control**: Use parameters in each role's ``profile`` field to control various parameters.
+
+Global collection control
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Use parameters in ppo_trainer.yaml to control the collection mode
+and steps.
+
+- global_profiler: Control the ranks and mode of profiling
+
+ - tool: The profiling tool to use, options are nsys, npu, torch,
+ torch_memory.
+ - steps: This parameter can be set as a list that has
+ collection steps, such as [2, 4], which means it will collect steps 2
+ and 4. If set to null, no collection occurs.
+ - save_path: The path to save the collected data. Default is
+ "outputs/profile".
+
+
+Role collection control
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In each role's ``profiler`` field, you can control the collection mode for that role.
+
+- enable: Whether to enable profiling for this role.
+- all_ranks: Whether to collect data from all ranks.
+- ranks: A list of ranks to collect data from. If empty, no data is collected.
+- tool_config: Configuration for the profiling tool used by this role.
+
+Use parameters in each role's ``profiler.tool_config.npu`` to control npu profiler behavior:
+
+- level: Collection level—options are level_none, level0, level1, and
+ level2
+
+ - level_none: Disables all level-based data collection (turns off profiler_level).
+ - level0: Collect high-level application data, underlying NPU data, and operator execution details on NPU. After balancing data volume and analytical capability, Level 0 is recommended as the default configuration.
+ - level1: Extends level0 by adding CANN-layer AscendCL data and AI Core performance metrics on NPU.
+ - level2: Extends level1 by adding CANN-layer Runtime data and AI CPU metrics.
+
+- contents: A list of options to control the collection content, such as
+ npu, cpu, memory, shapes, module, stack.
+
+ - npu: Whether to collect device-side performance data.
+ - cpu: Whether to collect host-side performance data.
+ - memory: Whether to enable memory analysis.
+ - shapes: Whether to record tensor shapes.
+  - module: Whether to record framework-layer Python call stack information. It is recommended to use 'module' instead of 'stack' for recording call stack information, as it incurs less performance overhead.
+ - stack: Whether to record operator call stack information.
+
+- analysis: Enables automatic data parsing.
+- discrete: Whether to enable discrete mode.
+
+
+Examples
+--------
+
+Disabling collection
+~~~~~~~~~~~~~~~~~~~~
+
+.. code:: yaml
+
+ global_profiler:
+ steps: null # disable profile
+
+End-to-End collection
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: yaml
+
+ global_profiler:
+ steps: [1, 2, 5]
+ save_path: ./outputs/profile
+ actor_rollout_ref:
+ actor: # Set actor role profiler collection configuration parameters
+ profiler:
+ enable: True
+ all_ranks: True
+ tool_config:
+ npu:
+ discrete: False
+ contents: [npu, cpu] # Control collection list, default cpu, npu, can configure memory, shapes, module, etc.
+ # rollout & ref follow actor settings
+
+
+Discrete Mode Collection
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: yaml
+
+ global_profiler:
+ steps: [1, 2, 5]
+ save_path: ./outputs/profile
+ actor_rollout_ref:
+ actor:
+ profiler:
+ enable: True # Set to True to profile training
+ all_ranks: False
+ ranks: [0] # Global Rank 0
+ tool_config:
+ npu:
+ discrete: True
+ contents: [npu, cpu]
+ rollout:
+ profiler:
+ enable: True # Set to True to profile inference
+ all_ranks: False
+ ranks: [0] # In Agent Loop mode, this is the Replica Rank (e.g., 0-th instance)
+ tool_config:
+ npu:
+ discrete: True # Must be enabled in Agent Loop mode
+ # ref follow actor settings
+
+**Agent Loop Scenario Description**:
+
+When Rollout runs in `Agent Loop <../advance/agent_loop.rst>`_ mode, performance data for the Rollout phase **must be collected using discrete mode**. At this time, the Profiler is triggered by the inference engine backend.
+
+1. **Rank Meaning**: ``ranks`` in the Rollout config refers to the **Replica Rank** (instance index), not the global rank.
+2. **Inference Engine Setup**:
+
+ - **vLLM Engine**
+ - **Must be configured via environment variables**:
+ - ``VLLM_TORCH_PROFILER_DIR``: Directory to save traces (**Required**).
+ - ``VLLM_TORCH_PROFILER_WITH_STACK``: Control stack tracing (1: on, 0: off, default: on).
+ - ``VLLM_TORCH_PROFILER_RECORD_SHAPES``: Set to 1 to record shapes of operator inputs.
+ - ``VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY``: Set to 1 to track tensor memory allocation/free.
+ - ``VLLM_TORCH_PROFILER_WITH_FLOPS``: Set to 1 to estimate FLOPS.
+      - *Note: vLLM ignores the save_path and contents in yaml; see the environment example below.*
+
+ - **SGLang Engine**
+ - **Zero Configuration**. Automatically reads configuration from ``ppo_trainer.yaml``.
+
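+For the vLLM engine, a typical environment setup before launching training might look like this (the directory value is an example):
+
+.. code-block:: bash
+
+    export VLLM_TORCH_PROFILER_DIR=./outputs/vllm_rollout_profile  # required
+    export VLLM_TORCH_PROFILER_WITH_STACK=1     # record call stacks (default: on)
+    export VLLM_TORCH_PROFILER_RECORD_SHAPES=1  # record operator input shapes
+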
+
+Visualization
+-------------
+
+Collected data is stored in the user-defined save_path and can be
+visualized by using the `MindStudio Insight `_ tool.
+
+Additionally, in a Linux environment, the MindStudio Insight tool is provided in the form of a `JupyterLab Plugin `_, offering a more intuitive and highly interactive user interface. The advantages of the JupyterLab plugin are as follows:
+
+- Seamless integration: Supports running the MindStudio Insight tool directly within the Jupyter environment, eliminating the need to switch platforms or copy data from the server, enabling data to be collected and used immediately.
+- Fast startup: Allows MindStudio Insight to be launched quickly via the JupyterLab command line or graphical interface.
+- Smooth operation: In a Linux environment, launching MindStudio Insight through JupyterLab effectively alleviates performance lag compared to the full-package communication mode, significantly improving the user experience.
+- Remote access: Supports remotely launching MindStudio Insight. Users can connect to the service via a local browser for direct visual analysis, reducing the difficulty of uploading and downloading data during large-model training or inference.
+
+If the analysis parameter is set to False, offline parsing is required after data collection:
+
+.. code:: python
+
+ import torch_npu
+ # Set profiler_path to the parent directory of the "localhost.localdomain___ascend_pt" folder
+ torch_npu.profiler.profiler.analyse(profiler_path=profiler_path)
+
+
+Advanced Guide: Fine-grained Collection
+---------------------------------------
+
+Background and Challenges
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Although the configuration-based collection method mentioned above is convenient, it faces challenges in training scenarios with **long sequences (Long Context)** or **large global batch sizes (Large Global Batch Size)**. Within a complete training step (Step), model computation exhibits high-frequency and repetitive characteristics:
+
+1. **Rollout phase**: Sequence generation (Generate Sequence) is an autoregressive process involving thousands of forward computations of the Decoder model.
+2. **Training phase**: To control peak memory usage, verl typically adopts a Micro-Batch strategy, dividing large data streams into multiple micro-batches for computation.
+
+ - **compute_log_prob (Actor/Ref)**: Involves multiple rounds of pure forward propagation.
+ - **update_policy (Actor/Critic)**: Involves multiple rounds of forward and backward propagation.
+
+This characteristic leads to massive and repetitive operator records from full profiling. As shown in the image below:
+
+.. image:: https://raw.githubusercontent.com/mengchengTang/verl-data/master/verl_ascend_profiler.png
+
+Even with ``discrete`` mode enabled, performance data files for a single stage can still reach several TB, leading to **parsing failures** or **visualization tool lag**.
+
+Solution: Critical Path Sampling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To solve the above problems, we can adopt a **critical path sampling** strategy: Based on the API interface provided by `torch_npu.profiler `_, directly modify Python source code to collect only representative data segments (such as specific Decode Steps or the first Micro-Batch).
+
+ **Important Notes**
+
+ 1. This chapter involves direct source code modification. It is recommended to back up files before modification and restore them after debugging.
+ 2. When using code instrumentation for collection, be sure to **disable global collection** (``global_profiler: steps: null``) in ``ppo_trainer.yaml`` or ``ppo_megatron_trainer.yaml`` to avoid Profiler conflicts.
+
+1. Fine-grained Collection in Rollout Phase
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+For vLLM or SGLang inference engines, we can control the ``schedule`` parameter to collect model forward propagation performance data for specific tokens.
+
+**vLLM Engine**
+
+- **Reference Version**: vLLM v0.11.0, vLLM-Ascend v0.11.0rc1
+- **Modified File**: ``vllm-ascend/vllm_ascend/worker/worker_v1.py``
+
+.. code-block:: diff
+
+ class NPUWorker(WorkerBase):
+
+ def __init__(self, *args, **kwargs):
+ # ... existing code ...
+
+ + # Initialize profiler
+ + import torch_npu
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(
+ + profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
+ + export_type=torch_npu.profiler.ExportType.Db, # You can choose torch_npu.profiler.ExportType.Text format
+ + )
+ + self.profiler_npu = torch_npu.profiler.profile(
+ + activities=[torch_npu.profiler.ProfilerActivity.CPU, torch_npu.profiler.ProfilerActivity.NPU],
+ + with_modules=False, # Collect call stack
+ + profile_memory=False, # Collect memory
+ + experimental_config=experimental_config,
+ + # Skip first step, warmup one step, collect 3 steps, repeat 1 time. If you want to collect decode steps 30~70, set schedule=torch_npu.profiler.schedule(wait=29, warmup=1, active=30, repeat=1)
+ + schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/vllm_profile", analyse_flag=True) # Data save path and whether to parse online
+ + )
+ + self.profiler_npu.start()
+
+ # ... existing code ...
+
+ def execute_model(self, scheduler_output=None, intermediate_tensors=None, **kwargs):
+ # ... existing code ...
+ output = self.model_runner.execute_model(scheduler_output,
+ intermediate_tensors)
+
+ + self.profiler_npu.step() # Drive schedule to collect partial decode steps
+
+ # ... existing code ...
+
+**SGLang Engine**
+
+- **Reference Version**: SGLang master branch
+- **Modified File**: ``sglang/python/sglang/srt/model_executor/model_runner.py``
+
+.. code-block:: diff
+
+ # ... existing imports ...
+ + import torch_npu
+
+ class ModelRunner:
+
+ def __init__(self, *args, **kwargs):
+ # ... existing init code ...
+
+ + # Initialize profiler (same configuration as above, omitted)
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(...)
+ + self.profiler_npu = torch_npu.profiler.profile(
+ + # ...
+ + # Skip first step, warmup one step, collect 3 steps, repeat 1 time.
+ + schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/sglang_profile", analyse_flag=True)
+ + )
+ + self.profiler_npu.start()
+
+ def forward(self, forward_batch, **kwargs):
+ # ... existing code ...
+
+ + self.profiler_npu.step() # Drive schedule to collect partial decode steps
+ return output
+
+2. Fine-grained Collection in compute_log_prob (Actor & Ref) Phase
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This phase computes probability distributions for new and old policies.
+
+**FSDP Backend**
+
+The FSDP backend allows fine-grained control at the Micro-Batch level.
+
+- **Modified File**: ``verl/workers/actor/dp_actor.py``
+
+.. code-block:: diff
+
+ # ... import dependencies ...
+ + import torch_npu
+
+ class DataParallelPPOActor(BasePPOActor):
+
+ def compute_log_prob(self, data: DataProto, calculate_entropy=False) -> torch.Tensor:
+
+ + role = "Ref" if self.actor_optimizer is None else "Actor"
+ + # Prepare profiler (same configuration as above, omitted)
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(...)
+ + self.prof_npu = torch_npu.profiler.profile(
+ + # ...
+ + # wait=0, warmup=0, active=1: directly collect first micro-batch
+ + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(f"./outputs/{role}_compute_log_prob", analyse_flag=True)
+ + )
+
+
+ + # This function is shared by ref and actor, set role flag to distinguish. If you want to collect actor_compute_log_prob, set if role=="Actor":
+ + if role=="Ref":
+ + self.prof_npu.start()
+
+ for micro_batch in micro_batches:
+
+ # ... original computation logic ...
+ with torch.no_grad():
+ entropy, log_probs = self._forward_micro_batch(...)
+
+ + # Drive schedule to collect micro batch
+ + if role=="Ref":
+ + self.prof_npu.step()
+
+ # ...
+
+
+**Megatron Backend**
+
+The Micro-Batch scheduling in the Megatron backend is managed internally by the framework and does not currently support fine-grained collection at the Micro-Batch level through simple code instrumentation. It is recommended to use global configuration for collection.
+
+3. Fine-grained Collection in update_policy (Actor & Critic) Phase
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Update phase includes forward and backward propagation.
+
+**FSDP Backend**
+
+The FSDP backend supports collection at both Mini-Batch and Micro-Batch granularities.
+
+- **Modified File**: ``verl/workers/actor/dp_actor.py``
+
+.. code-block:: diff
+
+ # ... import dependencies ...
+ + import torch_npu
+
+ class DataParallelPPOActor(BasePPOActor):
+
+ def update_policy(self, data: DataProto):
+
+ + # Prepare profiler (same configuration as above, omitted)
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(...)
+ + self.prof_npu = torch_npu.profiler.profile(
+ + # ...
+ + # Only collect first Mini Batch (including all Micro-Batch computations and one optimizer update)
+ + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/fsdp_actor_update_profile", analyse_flag=True)
+ + )
+ + self.prof_npu.start()
+
+ # ... PPO Epochs loop ...
+ for _ in range(self.config.ppo_epochs):
+ # ... Mini Batch loop ...
+ for batch_idx, mini_batch in enumerate(mini_batches):
+ # ... mini_batches split ...
+
+ for i, micro_batch in enumerate(micro_batches):
+ # ... Original Forward & Backward logic ...
+ # ... loss.backward() ...
+ pass
+
+ grad_norm = self._optimizer_step()
+
+ + # Drive schedule to collect mini batch, if you want micro batch collection, move self.prof_npu.step() inside the micro_batch loop
+ + self.prof_npu.step()
+
+
+**Megatron Backend**
+
+The Megatron backend supports collection at the Mini-Batch granularity.
+
+- **Modified File**: ``verl/workers/actor/megatron_actor.py``
+
+.. code-block:: diff
+
+ class MegatronPPOActor(BasePPOActor):
+
+ def update_policy(self, dataloader: Iterable[DataProto]) -> dict:
+ # ...
+ + # Prepare profiler (same configuration as above, omitted)
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(...)
+ + self.prof_npu = torch_npu.profiler.profile(
+ + # ...
+ + # Only collect computation of first Mini Batch (including all Micro-Batches) and one optimizer update
+ + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/megatron_actor_update_profile", analyse_flag=True)
+ + )
+ + self.prof_npu.start()
+
+ for data in dataloader:
+ # ... internally calls self.forward_backward_batch for computation ...
+ # ... metric_micro_batch = self.forward_backward_batch(...)
+
+ # ... self.actor_optimizer.step() ...
+
+ + # Drive schedule to collect mini batch
+ + self.prof_npu.step()
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_profiling_zh.rst b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_profiling_zh.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6f27f81bea2bb7543b8e21c2f7292e8842fe5b98
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_profiling_zh.rst
@@ -0,0 +1,398 @@
+Performance data collection based on FSDP or MindSpeed(Megatron) on Ascend devices(zh)
+========================================================================================
+
+Last updated: 12/20/2025.
+
+This is a tutorial for collecting performance data with the GRPO or DAPO algorithm based on the FSDP or MindSpeed (Megatron) backend on Ascend devices.
+
+Configuration
+-------------
+
+Two levels of profiler settings control data collection:
+
+- Global collection control: use the options in verl/trainer/config/ppo_trainer.yaml (FSDP) or verl/trainer/config/ppo_megatron_trainer.yaml (MindSpeed) to control the collection mode and steps.
+- Role profiler control: use the options in each role's ``profiler`` field to control role-specific parameters.
+
+Global collection control
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Use the parameters in ppo_trainer.yaml to control the collection steps and mode:
+
+- global_profiler: controls the ranks and mode of profiling
+
+  - tool: the profiling tool to use; options are nsys, npu, torch, torch_memory.
+  - steps: a list of the steps to collect, e.g. [2, 4] collects steps 2 and 4. If set to null, no collection occurs.
+  - save_path: the path where collected data is saved. Default is "outputs/profile".
+
+Role profiler control
+~~~~~~~~~~~~~~~~~~~~~
+
+In each role's ``profiler`` field, you can control the collection mode for that role.
+
+- enable: whether to enable profiling for this role.
+- all_ranks: whether to collect data from all ranks.
+- ranks: a list of ranks to collect data from. If empty, no data is collected.
+- tool_config: configuration of the profiling tool used by this role.
+
+Use the parameters in each role's ``profiler.tool_config.npu`` to control the concrete collection behavior:
+
+- level: collection level; options are level_none, level0, level1, and level2
+
+  - level_none: disables all level-based data collection (turns off profiler_level).
+  - level0: collects high-level application data, low-level NPU data, and operator execution details on the NPU. Balancing data volume against analytical capability, level0 is the recommended default configuration.
+  - level1: extends level0 with CANN-layer AscendCL data and AI Core performance metrics on the NPU.
+  - level2: extends level1 with CANN-layer Runtime data and AI CPU metrics.
+
+- contents: a list of options controlling the collected content, such as
+  npu, cpu, memory, shapes, module, stack.
+
+  - npu: whether to collect device-side performance data.
+  - cpu: whether to collect host-side performance data.
+  - memory: whether to enable memory analysis.
+  - shapes: whether to record tensor shapes.
+  - module: whether to record framework-layer Python call stack information. Prefer module over stack for recording call stacks, as it incurs less performance overhead.
+  - stack: whether to record operator call stack information.
+
+- analysis: enables automatic data parsing.
+- discrete: whether to use discrete mode.
+
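+As a concrete reference, the sketch below combines these options into one role-level ``tool_config`` (the values are illustrative, not recommendations):
+
+.. code:: yaml
+
+ actor_rollout_ref:
+   actor:
+     profiler:
+       enable: True
+       all_ranks: False
+       ranks: [0]
+       tool_config:
+         npu:
+           level: level0 # recommended default
+           contents: [npu, cpu, module]
+           analysis: True # parse the data online right after collection
+           discrete: False
+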
+Examples
+--------
+
+Disable collection
+~~~~~~~~~~~~~~~~~~
+
+.. code:: yaml
+
+ global_profiler:
+ steps: null # disable profile
+
+End-to-end collection
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: yaml
+
+ global_profiler:
+ steps: [1, 2, 5]
+ save_path: ./outputs/profile
+ actor_rollout_ref:
+ actor: # profiler collection settings for the actor role
+ profiler:
+ enable: True
+ all_ranks: True
+ tool_config:
+ npu:
+ discrete: False
+ contents: [npu, cpu] # collection contents; cpu and npu by default, memory, shapes, module, etc. can be added
+
+ # rollout & ref follow actor settings
+
+
+Discrete-mode collection
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: yaml
+
+ global_profiler:
+ steps: [1, 2, 5]
+ save_path: ./outputs/profile
+ actor_rollout_ref:
+ actor:
+ profiler:
+ enable: True # set True to profile the training stage
+ all_ranks: False
+ ranks: [0] # global rank 0
+ tool_config:
+ npu:
+ discrete: True
+ contents: [npu, cpu]
+ rollout:
+ profiler:
+ enable: True # set True to profile the rollout (inference) stage
+ all_ranks: False
+ ranks: [0] # in Agent Loop mode this is the Replica Rank of the inference instance (e.g. instance 0)
+ tool_config:
+ npu:
+ discrete: True # discrete mode is mandatory in Agent Loop mode
+ # ref follows actor settings
+
+**Notes for the Agent Loop scenario**:
+
+When rollout runs in `Agent Loop <../advance/agent_loop.rst>`_ mode, performance data for the rollout stage **must be collected in discrete mode**. The profiler is then triggered by the inference-engine backend, with the following requirements:
+
+1. **Rank semantics**: ``ranks`` in the rollout config refers to the **Replica Rank** (instance index), not the global rank.
+2. **Inference-engine configuration**:
+
+ - **vLLM engine**
+ - **Must be configured via environment variables** (see the sketch after this list):
+ - ``VLLM_TORCH_PROFILER_DIR``: the trace output directory (**required**).
+ - ``VLLM_TORCH_PROFILER_WITH_STACK``: whether to record call stacks (1 on, 0 off; on by default).
+ - ``VLLM_TORCH_PROFILER_RECORD_SHAPES``: set to 1 to record shapes.
+ - ``VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY``: set to 1 to record memory.
+ - ``VLLM_TORCH_PROFILER_WITH_FLOPS``: set to 1 to estimate FLOPS.
+ - *Note: vLLM ignores save_path and contents from the yaml.*
+
+ - **SGLang engine**
+ - **Zero configuration**: it automatically reads the settings in ``ppo_trainer.yaml``.
+
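+For example, a minimal launch-time environment setup for vLLM profiling under Agent Loop might look like the following (the output path is illustrative):
+
+.. code-block:: bash
+
+ # Required: directory where vLLM writes profiler traces
+ export VLLM_TORCH_PROFILER_DIR=./outputs/agent_loop_profile
+ # Optional switches described above
+ export VLLM_TORCH_PROFILER_WITH_STACK=0 # disable call-stack recording
+ export VLLM_TORCH_PROFILER_RECORD_SHAPES=1 # record tensor shapes
+ export VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1 # record memory
+ export VLLM_TORCH_PROFILER_WITH_FLOPS=1 # estimate FLOPS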
+
+Visualization
+-------------
+
+The collected data is stored under the user-configured save_path and can be visualized with the `MindStudio Insight `_ tool.
+
+On Linux, MindStudio Insight is also available as a `JupyterLab plugin `_, offering a more intuitive and interactive interface. Its advantages:
+
+- Seamless integration: runs MindStudio Insight directly in the Jupyter environment, with no platform switching and no copying of data off the server; data is usable as soon as it is collected.
+- Fast startup: MindStudio Insight can be launched quickly from the JupyterLab command line or GUI.
+- Smooth operation: on Linux, launching MindStudio Insight through JupyterLab avoids the sluggishness of full-package communication, noticeably improving the experience.
+- Remote access: MindStudio Insight can be started remotely and accessed from a local browser for visual analysis, easing the difficulty of uploading and downloading large training or inference datasets.
+
+If the analysis parameter is set to False, the collected data must be parsed offline afterwards:
+
+.. code:: python
+
+ import torch_npu
+ # set profiler_path to the parent directory of the "localhost.localdomain___ascend_pt" directory
+ torch_npu.profiler.profiler.analyse(profiler_path=profiler_path)
+
+
+Advanced guide: fine-grained collection
+----------------------------------------
+
+Background and challenges
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The config-file-driven collection above is convenient, but it runs into trouble in **long-context** or **large global batch size** training scenarios.
+Within a single training step, model computation is high-frequency and repetitive:
+
+1. Rollout stage: sequence generation is an autoregressive process involving thousands of forward passes through the decoder model.
+2. Training stage: to cap peak memory, verl typically uses a micro-batch strategy that splits the large data stream into many micro-batches.
+
+ - compute_log_prob (Actor/Ref): multiple rounds of pure forward passes.
+ - update_policy (Actor/Critic): multiple rounds of forward and backward passes.
+
+As a result, full profiling produces a massive volume of duplicated operator records, as shown below:
+
+.. image:: https://raw.githubusercontent.com/mengchengTang/verl-data/master/verl_ascend_profiler.png
+
+Even with ``discrete`` mode, the performance data file for a single stage can reach several TB, causing **parsing failures** or **visualization tools to hang**.
+
+Solution: critical-path sampling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To address this, we can apply a **critical-path sampling** strategy: using the APIs provided by `torch_npu.profiler `_, modify the Python source directly so that only representative data slices are collected (e.g. specific decode steps or the first micro-batch).
+
+ **Important notes**
+
+ 1. This section involves modifying source code directly. Back up the files before editing and restore them after debugging.
+ 2. When collecting via code instrumentation, be sure to **disable global collection** in ``ppo_trainer.yaml`` or ``ppo_megatron_trainer.yaml`` (``global_profiler: steps: null``) to avoid profiler conflicts.
+
+1. Fine-grained collection for the rollout stage
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+For the vLLM or SGLang inference engine, the ``schedule`` parameter controls for which tokens the forward-pass performance data is collected.
+
+**vLLM 引擎**
+
+- **Reference versions**: vLLM v0.11.0, vLLM-Ascend v0.11.0rc1
+- **Modified file**: ``vllm-ascend/vllm_ascend/worker/worker_v1.py``
+
+.. code-block:: diff
+
+ class NPUWorker(WorkerBase):
+
+ def __init__(self, *args, **kwargs):
+ # ... existing code ...
+
+ + # Initialize profiler
+ + import torch_npu
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(
+ + profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
+ + export_type=torch_npu.profiler.ExportType.Db, # torch_npu.profiler.ExportType.Text is also available
+ + )
+ + self.profiler_npu = torch_npu.profiler.profile(
+ + activities=[torch_npu.profiler.ProfilerActivity.CPU, torch_npu.profiler.ProfilerActivity.NPU],
+ + with_modules=False, # record module call stacks
+ + profile_memory=False, # profile memory
+ + experimental_config=experimental_config,
+ + # Skip the first step, warm up for one step, collect 3 steps, repeat once. To profile, e.g., decode steps ~30-60, use schedule=torch_npu.profiler.schedule(wait=29, warmup=1, active=30, repeat=1)
+ + schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/vllm_profile", analyse_flag=True) # output path; analyse_flag enables online parsing
+ + )
+ + self.profiler_npu.start()
+
+ # ... existing code ...
+
+ def execute_model(self, scheduler_output=None, intermediate_tensors=None, **kwargs):
+ # ... existing code ...
+ output = self.model_runner.execute_model(scheduler_output,
+ intermediate_tensors)
+
+ + self.profiler_npu.step() # drive the schedule to profile selected decode steps
+
+ # ... existing code ...
+
+**SGLang 引擎**
+
+- **Reference version**: SGLang master branch
+- **Modified file**: ``sglang/python/sglang/srt/model_executor/model_runner.py``
+
+.. code-block:: diff
+
+ # ... existing imports ...
+ + import torch_npu
+
+ class ModelRunner:
+
+ def __init__(self, *args, **kwargs):
+ # ... existing init code ...
+
+ + # Initialize profiler (same configuration as above, omitted)
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(...)
+ + self.profiler_npu = torch_npu.profiler.profile(
+ + # ...
+ + # Skip the first step, warm up for one step, collect 3 steps, repeat once.
+ + schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/sglang_profile", analyse_flag=True)
+ + )
+ + self.profiler_npu.start()
+
+ def forward(self, forward_batch, **kwargs):
+ # ... existing code ...
+
+ + self.profiler_npu.step() # drive the schedule to profile selected decode steps
+ return output
+
+2. Fine-grained collection for the compute_log_prob stage (Actor & Ref)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This stage computes the probability distributions of the new and old policies.
+
+**FSDP Backend**
+
+The FSDP backend allows fine-grained control at the micro-batch level.
+
+- **Modified file**: ``verl/workers/actor/dp_actor.py``
+
+.. code-block:: diff
+
+ # ... imports ...
+ + import torch_npu
+
+ class DataParallelPPOActor(BasePPOActor):
+
+ def compute_log_prob(self, data: DataProto, calculate_entropy=False) -> torch.Tensor:
+
+ + role = "Ref" if self.actor_optimizer is None else "Actor"
+ + # Prepare profiler (same configuration as above, omitted)
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(...)
+ + self.prof_npu = torch_npu.profiler.profile(
+ + # ...
+ + # wait=0, warmup=0, active=1: profile the first micro-batch directly
+ + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(f"./outputs/{role}_compute_log_prob", analyse_flag=True)
+ + )
+
+
+ + # This function is shared by ref and actor; the role flag tells them apart. To profile actor_compute_log_prob instead, change this to: if role=="Actor":
+ + if role=="Ref":
+ + self.prof_npu.start()
+
+ for micro_batch in micro_batches:
+
+ # ... original computation logic ...
+ with torch.no_grad():
+ entropy, log_probs = self._forward_micro_batch(...)
+
+ + # drive the schedule to profile this micro-batch
+ + if role=="Ref":
+ + self.prof_npu.step()
+
+ # ...
+
+
+**Megatron Backend**
+
+The Megatron backend manages micro-batch scheduling internally, so micro-batch-level fine-grained collection via simple code instrumentation is not yet supported. Use the global configuration instead.
+
+3. Fine-grained collection for the update_policy stage (Actor & Critic)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The update stage involves both forward and backward passes.
+
+**FSDP Backend**
+
+The FSDP backend supports collection at either mini-batch or micro-batch granularity.
+
+- **Modified file**: ``verl/workers/actor/dp_actor.py``
+
+.. code-block:: diff
+
+ # ... imports ...
+ + import torch_npu
+
+ class DataParallelPPOActor(BasePPOActor):
+
+ def update_policy(self, data: DataProto):
+
+ + # Prepare profiler (same configuration as above, omitted)
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(...)
+ + self.prof_npu = torch_npu.profiler.profile(
+ + # ...
+ + # Only profile the first mini-batch (all of its micro-batches plus one optimizer update)
+ + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/fsdp_actor_update_profile", analyse_flag=True)
+ + )
+ + self.prof_npu.start()
+
+ # ... PPO epochs loop ...
+ for _ in range(self.config.ppo_epochs):
+ # ... mini-batch loop ...
+ for batch_idx, mini_batch in enumerate(mini_batches):
+ # ... mini_batches split ...
+
+ for i, micro_batch in enumerate(micro_batches):
+ # ... original forward & backward logic ...
+ # ... loss.backward() ...
+ pass
+
+ grad_norm = self._optimizer_step()
+
+ + # drive the schedule to profile the mini-batch; for micro-batch collection, move self.prof_npu.step() inside the micro_batch loop
+ + self.prof_npu.step()
+
+
+**Megatron Backend**
+
+The Megatron backend supports collection at mini-batch granularity.
+
+- **Modified file**: ``verl/workers/actor/megatron_actor.py``
+
+.. code-block:: diff
+
+ class MegatronPPOActor(BasePPOActor):
+
+ def update_policy(self, dataloader: Iterable[DataProto]) -> dict:
+ # ...
+ + # Prepare profiler (same configuration as above, omitted)
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(...)
+ + self.prof_npu = torch_npu.profiler.profile(
+ + # ...
+ + # Only profile the computation of the first mini-batch (all of its micro-batches) plus one optimizer update
+ + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/megatron_actor_update_profile", analyse_flag=True)
+ + )
+ + self.prof_npu.start()
+
+ for data in dataloader:
+ # ... internally calls self.forward_backward_batch for computation ...
+ # ... metric_micro_batch = self.forward_backward_batch(...)
+
+ # ... self.actor_optimizer.step() ...
+
+ + # drive the schedule to profile the mini-batch
+ + self.prof_npu.step()
diff --git a/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_quick_start.rst b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_quick_start.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1fa607befe48e402ca8c4f7dd03549ef5830ef4f
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_quick_start.rst
@@ -0,0 +1,289 @@
+Ascend Quickstart
+===================================
+
+Last updated: 12/11/2025.
+
+We add support for Huawei Ascend devices to verl.
+
+
+Key updates
+----------------------------------
+
+2025/12/11: verl now auto-detects the NPU device type in existing scenarios, so GPU scripts can in principle run on Ascend without explicitly setting trainer.device=npu. New features can still set trainer.device explicitly to take priority while auto-detection support is gradually adopted.
+
+ [Note] Auto-detection of the NPU device type requires the torch_npu package in the runtime environment. If it is absent, trainer.device=npu must still be set explicitly.
+
+Supported hardware
+-----------------------------------
+
+Atlas 200T A2 Box16
+
+Atlas 900 A2 PODc
+
+Atlas 800T A3
+
+
+Installation
+-----------------------------------
+
+
+Dockerfile image build & usage
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To build an image from the Dockerfile, or to use a prebuilt verl-based image, see the `documentation `_ .
+
+
+Install the base environment
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+1. The base environment involves the following packages; install them per the `documentation `_ .
+
+ +---------------+----------------------+
+ | software | version |
+ +---------------+----------------------+
+ | Python | >= 3.10, <3.12 |
+ +---------------+----------------------+
+ | CANN | == 8.3.RC1 |
+ +---------------+----------------------+
+ | torch | == 2.7.1 |
+ +---------------+----------------------+
+ | torch_npu | == 2.7.1 |
+ +---------------+----------------------+
+
+2. (Optional) When installing on an x86 platform, pip needs an extra index configured:
+
 .. code-block:: bash
+
+ pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/"
+
+
+Install other packages
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Once the base environment is ready, install the following packages:
+
+ +---------------+----------------------+
+ | torchvision | == 0.22.1 |
+ +---------------+----------------------+
+ | triton-ascend | == 3.2.0rc4 |
+ +---------------+----------------------+
+ | transformers | latest release |
+ +---------------+----------------------+
+
+ Install commands:
+
+ .. code-block:: bash
+
+ # install torchvision; the version must match torch
+ pip install torchvision==0.22.1
+
+ # remove any leftover triton/triton-ascend packages from the environment
+ pip uninstall -y triton triton-ascend
+
+ # install triton-ascend; triton itself does not need to be installed separately
+ pip install triton-ascend==3.2.0rc4
+
+
+Install vllm & vllm-ascend
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+1. Make sure the CANN ascend-toolkit and nnal environment variables are activated. For the default CANN install path /usr/local/Ascend, the activation commands are:
+
+ .. code-block:: bash
+
+ source /usr/local/Ascend/ascend-toolkit/set_env.sh
+ source /usr/local/Ascend/nnal/atb/set_env.sh
+
+2. Install vllm from source:
+
+ .. code-block:: bash
+
+ git clone --depth 1 --branch v0.11.0 https://github.com/vllm-project/vllm.git
+ cd vllm && VLLM_TARGET_DEVICE=empty pip install -v -e . && cd ..
+
+3. Install vllm-ascend from source:
+
+ .. code-block:: bash
+
+ git clone --depth 1 --branch v0.11.0rc1 https://github.com/vllm-project/vllm-ascend.git
+ cd vllm-ascend && pip install -v -e . && cd ..
+
+
+Install MindSpeed
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Install MindSpeed from source:
+
+ .. code-block:: bash
+
+ # clone MindSpeed, check out the pinned commit, and clone Megatron-LM
+ git clone https://gitcode.com/Ascend/MindSpeed.git
+ cd MindSpeed && git checkout f2b0977e && cd ..
+ git clone --depth 1 --branch core_v0.12.1 https://github.com/NVIDIA/Megatron-LM.git
+
+ # install MindSpeed & Megatron
+ pip install -e MindSpeed
+
+ # add the Megatron-LM source path to the PYTHONPATH environment variable
+ export PYTHONPATH=$PYTHONPATH:"$(pwd)/Megatron-LM"
+
+ # (optional) to keep PYTHONPATH effective after the shell closes or the system reboots, append it to .bashrc
+ echo "export PYTHONPATH=$PYTHONPATH:\"$(pwd)/Megatron-LM\"" >> ~/.bashrc
+
+ # install mbridge
+ pip install mbridge
+
+MindSpeed serves the Megatron-LM backend use case; usage is as follows (see the sketch after this list):
+
+ 1. Set the verl worker model ``strategy`` to ``megatron``, e.g. ``actor_rollout_ref.actor.strategy=megatron``.
+
+ 2. Custom MindSpeed arguments can be passed via ``override_transformer_config``; for example, enable the FA feature for the actor model with ``+actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True``.
+
+ 3. For more features, see the `MindSpeed & verl documentation `_ .
+
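+Putting items 1 and 2 together, a minimal sketch of the relevant command-line overrides (the FA flag is just the example from item 2; every other flag stays as in the quick start below):
+
+.. code-block:: bash
+
+ # switch the actor to the Megatron (MindSpeed) backend and enable flash attention
+ python3 -m verl.trainer.main_ppo \
+     actor_rollout_ref.actor.strategy=megatron \
+     +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True \
+     data.train_files=$HOME/data/gsm8k/train.parquet # ...remaining flags as in the quick start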
+
+Install verl
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+ git clone --depth 1 https://github.com/volcengine/verl.git
+ cd verl && pip install -r requirements-npu.txt && pip install -v -e . && cd ..
+
+
+Ecosystem libraries not yet supported on Ascend
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The following ecosystem libraries are not yet supported on Ascend in verl:
+
+ +---------------+----------------+
+ | software | description |
+ +---------------+----------------+
+ | flash_attn | not supported |
+ +---------------+----------------+
+ | liger-kernel | not supported |
+ +---------------+----------------+
+
+ 1. Flash attention acceleration via flash_attn is not supported; using it through transformers is supported.
+ 2. Enabling liger-kernel is not supported.
+
+
+Quick start
+-----------------------------------
+Before formal use, we recommend verifying your environment preparation and installation by training Qwen2.5-0.5B with GRPO.
+
+1. Download the dataset and preprocess it into parquet format so that it contains the fields needed to compute RL rewards
+
+ .. code-block:: bash
+
+ python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k
+
+2. Run training
+
+ .. code-block:: bash
+
+ set -x
+
+ export VLLM_ATTENTION_BACKEND=XFORMERS
+
+ python3 -m verl.trainer.main_ppo \
+ algorithm.adv_estimator=grpo \
+ data.train_files=$HOME/data/gsm8k/train.parquet \
+ data.val_files=$HOME/data/gsm8k/test.parquet \
+ data.train_batch_size=128 \
+ data.max_prompt_length=512 \
+ data.max_response_length=128 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
+ actor_rollout_ref.actor.optim.lr=5e-7 \
+ actor_rollout_ref.model.use_remove_padding=False \
+ actor_rollout_ref.actor.entropy_coeff=0.001 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=64 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=20 \
+ actor_rollout_ref.actor.use_kl_loss=True \
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \
+ actor_rollout_ref.rollout.enable_chunked_prefill=False \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ actor_rollout_ref.rollout.n=5 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ algorithm.kl_ctrl.kl_coef=0.001 \
+ trainer.critic_warmup=0 \
+ trainer.logger=console \
+ trainer.project_name='verl_grpo_example_gsm8k' \
+ trainer.experiment_name='qwen2_7b_function_rm' \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=-1 \
+ trainer.test_freq=5 \
+ trainer.total_epochs=1 $@
+
+
+
+Algorithm support status
+-----------------------------------
+
+**Table 1** RL algorithms
+
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | algorithm | model | download link | actor.strategy | rollout.name | shell location | hardware |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | GRPO | Qwen2.5-7B-instruct |`7B `_ | FSDP | vllm-ascend |`qwen2_5_7b_grpo_npu `_ | Atlas 200T A2 Box16 |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | GRPO | Qwen2.5-32B-instruct |`32B `_ | FSDP | vllm-ascend |`qwen2_5_32b_grpo_npu `_ | Atlas 200T A2 Box16 |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | GRPO | Qwen2.5-VL-3B-instruct |`3B `_ | FSDP | vllm-ascend |`qwen2_5_vl_3b_npu `_ | Atlas 200T A2 Box16 |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | GRPO | Qwen2.5-VL-7B-instruct |`7B `_ | FSDP | vllm-ascend |`qwen2_5_vl_7b_npu `_ | Atlas 200T A2 Box16 |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | GRPO | Qwen2.5-VL-32B-instruct |`32B `_ | FSDP | vllm-ascend |`qwen2_5_vl_32b_npu `_ | Atlas 200T A2 Box16 |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | GRPO | Qwen3-4B |`4B `_ | FSDP | vllm-ascend |`qwen3-4B_npu `_ | Atlas 800T A3 |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | GRPO | Qwen3-8B |`8B `_ | FSDP | vllm-ascend |`qwen3_8b_vllm_npu `_ | Atlas 200T A2 Box16 |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | GRPO | Qwen3-8B |`8B `_ | FSDP | sglang |`qwen3_8b_sglang_npu `_ | Atlas 200T A2 Box16 |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | GRPO | Qwen3-32B |`32B `_ | FSDP | vllm-ascend |`qwen3-32B_npu `_ | Atlas 200T A2 Box16 |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | GRPO | DeepSeekv3-671B |`671B `_ | Megatron | vllm-ascend |`deepseek_v3_megatron_npu `_ | Atlas 200T A2 Box16 |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | DAPO | Qwen2.5-7B-instruct |`7B `_ | FSDP | vllm-ascend |`qwen2.5_7b_npu `_ | Atlas 200T A2 Box16 |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | DAPO | Qwen2.5-32B |`32B `_ | FSDP | vllm-ascend |`qwen2.5_32b_npu `_ | Atlas 200T A2 Box16 |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | DAPO | Qwen3-8B-base |`8B `_ | FSDP | vllm-ascend |`qwen3_8b_npu `_ | Atlas 200T A2 Box16 |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | DAPO | Qwen3-14B-base |`14B `_ | FSDP | vllm-ascend |`qwen3_14b_npu `_ | Atlas 200T A2 Box16 |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | DAPO | Qwen3-30B-A3B-base |`30B `_ | FSDP | vllm-ascend |`qwen3_30b_fsdp_npu `_ | Atlas 200T A2 Box16 |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | DAPO | Qwen3-30B-A3B-base |`30B `_ | Megatron | vllm-ascend |`qwen3_30b_megatron_npu `_ | Atlas 200T A2 Box16 |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | PPO | Qwen3-8B |`8B `_ | FSDP | vllm-ascend |`qwen3_8b_ppo_npu `_ | Atlas 900 A2 PODc |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+ | One_Step_Off_Policy | Qwen3-8B |`8B `_ | FSDP2 | vllm-ascend |`qwen3_8b_fsdp2_npu `_ | Atlas 800T A3 |
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
+
+**Table 2** SFT algorithms
+
+ +-----------+-------------------------+------------------------------------------------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
+ | algorithm | model | download link | actor.strategy | shell location | hardware |
+ +-----------+-------------------------+------------------------------------------------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
+ | SFT-PEFT | Qwen3-8B |`8B `_ | FSDP |`sft_peft_sp2_npu `_ | Atlas 900 A2 PODc |
+ +-----------+-------------------------+------------------------------------------------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
+ | ReTool-SFT| Qwen2-7B-instruct |`7B `_ | FSDP |`qwen2_7b_sft_npu `_ | Atlas 900 A2 PODc |
+ +-----------+-------------------------+------------------------------------------------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
+
+
+Disclaimer
+-----------------------------------
+The Ascend support code, Dockerfiles, and images provided in verl are reference samples. For production use, please reach out through official channels. Thank you.
diff --git a/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_sglang_quick_start.rst b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_sglang_quick_start.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8b1661cbbe4e6fc0b2eba6aeacc485dc8be7d99a
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_sglang_quick_start.rst
@@ -0,0 +1,153 @@
+Ascend Quickstart with SGLang Backend
+=====================================
+
+Last updated: 01/27/2026.
+
+We add support for Huawei Ascend devices to verl.
+
+Supported hardware
+-----------------------------------
+
+Atlas 200T A2 Box16
+
+Atlas 900 A2 PODc
+
+Atlas 800T A3
+
+
+Installation
+-----------------------------------
+Key supported versions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++-----------+-----------------+
+| software | version |
++===========+=================+
+| Python | == 3.11 |
++-----------+-----------------+
+| HDK | >= 25.3.RC1 |
++-----------+-----------------+
+| CANN | >= 8.3.RC1 |
++-----------+-----------------+
+| torch | >= 2.7.1 |
++-----------+-----------------+
+| torch_npu | >= 2.7.1.post2 |
++-----------+-----------------+
+| sglang | v0.5.8 |
++-----------+-----------------+
+
+Install from a Docker image
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+We provide Dockerfiles for image building; see `dockerfile_build_guidance `_ and choose the build file matching your device.
+
+Install into a custom environment
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+**1. Install and activate the HDK & CANN dependencies**
+
+CANN (Compute Architecture for Neural Networks) is the heterogeneous computing architecture Ascend provides for AI scenarios. To give the training and inference engines better, faster hardware support, we need to install the following `prerequisites `_
+
++-----------+-------------+
+| HDK | >= 25.3.RC1 |
++-----------+-------------+
+| CANN | >= 8.3.RC1 |
++-----------+-------------+
+After installation, activate the environment:
+
+.. code-block:: bash
+
+ source /usr/local/Ascend/ascend-toolkit/set_env.sh
+ source /usr/local/Ascend/nnal/atb/set_env.sh
+
+**2. Create a conda environment**
+
+.. code-block:: bash
+
+ # create conda env
+ conda create -n verl-sglang python==3.11
+ conda activate verl-sglang
+
+**3. Then run the script we provide in verl:** `install_sglang_mcore_npu.sh `_
+
+If you hit errors at this step, inspect the script and follow its steps manually.
+
+.. code-block:: bash
+
+ git clone https://github.com/volcengine/verl.git
+ # Make sure you have activated verl conda env
+ # NPU_DEVICE=A3 or A2 depends on your device
+ NPU_DEVICE=A3 bash verl/scripts/install_sglang_mcore_npu.sh
+
+**4. Install verl**
+
+.. code-block:: bash
+
+ cd verl
+ pip install --no-deps -e .
+ pip install -r requirements-npu.txt
+
+
+Quick start
+-----------------------------------
+
+**1. Overview of current NPU SGLang scripts**
+
+.. _Qwen3-30B: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen3moe-30b_sglang_megatron_npu.sh
+.. _Qwen2.5-32B: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen2-32b_sglang_fsdp_npu.sh
+.. _Qwen3-8B-1k: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh
+.. _Qwen3-8B-32k: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh
+
+ +-----------------+----------------+----------+-------------------+
+ | Model           | Recommended NPU| Nodes    | Backend           |
+ +=================+================+==========+===================+
+ | `Qwen3-30B`_    | Atlas 800T A3  | 1        | SGLang + Megatron |
+ +-----------------+----------------+----------+-------------------+
+ | `Qwen2.5-32B`_  | Atlas 900 A2   | 2        | SGLang + FSDP     |
+ +-----------------+----------------+----------+-------------------+
+ | `Qwen3-8B-1k`_  | Atlas A3/A2    | 1        | SGLang + FSDP     |
+ +-----------------+----------------+----------+-------------------+
+ | `Qwen3-8B-32k`_ | Atlas A3/A2    | 1        | SGLang + FSDP     |
+ +-----------------+----------------+----------+-------------------+
+
+**2. Best practices**
+
+We provide `best practices `_ based on verl+sglang for `Qwen3-30B`_ and `Qwen2.5-32B`_ as a reference
+
+**3. Environment variables and parameters**
+
+The following environment variables must currently be set to support the sglang backend on NPU
+
+.. code-block:: bash
+
+ # support multiple processes per NPU card: https://www.hiascend.com/document/detail/zh/canncommercial/850/commlib/hcclug/hcclug_000091.html
+ export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
+ export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
+ # work around ray on the device side failing to detect device availability via the is_npu_available interface
+ export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
+ # set according to the current device and the number of cards required
+ export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+ # required when enabling inference expert parallelism (EP)
+ export SGLANG_DEEPEP_BF16_DISPATCH=1
+
+
+
+verl already parses common inference parameters; see the ServerArgs initialization in `async_sglang_server.py `_. Other `sglang parameters `_ can be passed through engine_kwargs.
+
+To convert a vLLM-backend inference script to sglang, add or modify the following parameters
+
+.. code-block:: bash
+
+ # required
+ actor_rollout_ref.rollout.name=sglang
+ +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend"
+ # optional
+ # enable inference EP; for detailed usage see https://github.com/sgl-project/sgl-kernel-npu/blob/main/python/deep_ep/README_CN.md
+ ++actor_rollout_ref.rollout.engine_kwargs.sglang.deepep_mode="auto"
+ ++actor_rollout_ref.rollout.engine_kwargs.sglang.moe_a2a_backend="deepep"
+ # must be set to True for MoE models with multiple DP
+ +actor_rollout_ref.rollout.engine_kwargs.sglang.enable_dp_attention=False
+ # chunked_prefill is disabled by default
+ +actor_rollout_ref.rollout.engine_kwargs.sglang.chunked_prefill_size=-1
+
+
+
diff --git a/code/RL_model/verl/verl_train/docs/ascend_tutorial/dockerfile_build_guidance.rst b/code/RL_model/verl/verl_train/docs/ascend_tutorial/dockerfile_build_guidance.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e9624d7a6d5ad09ce95b633f8d09437c85d4e946
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/ascend_tutorial/dockerfile_build_guidance.rst
@@ -0,0 +1,82 @@
+Ascend Dockerfile Build Guidance
+===================================
+
+Last updated: 12/4/2025.
+
+We add support for building Huawei Ascend images to verl.
+
+
+Supported hardware for images
+-----------------------------------
+
+Atlas 200T A2 Box16
+
+Atlas 900 A2 PODc
+
+Atlas 800T A3
+
+
+Component versions inside the image
+------------------------------------
+
+================= ============
+Component         Version
+================= ============
+Base image        Ubuntu 22.04
+Python 3.11
+CANN 8.3.RC1
+torch 2.7.1
+torch_npu 2.7.1
+torchvision 0.22.1
+vLLM 0.11.0
+vLLM-ascend 0.11.0rc1
+Megatron-LM v0.12.1
+MindSpeed (f2b0977e)
+triton-ascend 3.2.0rc4
+mbridge latest version
+SGLang v0.5.8
+sgl-kernel-npu (46b73de)
+================= ============
+
+
+Dockerfile build script list
+----------------------------
+
+============== ============== ============== ==============================================================
+Device type    Base image     Backend        Reference file
+============== ============== ============== ==============================================================
+A2 8.2.RC1 vLLM `Dockerfile.ascend_8.2.rc1_a2 `_
+A2 8.3.RC1 vLLM `Dockerfile.ascend_8.3.rc1_a2 `_
+A2 8.3.RC1 SGLang `Dockerfile.ascend.sglang_8.3.rc1_a2 `_
+A3 8.2.RC1 vLLM `Dockerfile.ascend_8.2.rc1_a3 `_
+A3 8.3.RC1 vLLM `Dockerfile.ascend_8.3.rc1_a3 `_
+A3 8.3.RC1 SGLang `Dockerfile.ascend.sglang_8.3.rc1_a3 `_
+============== ============== ============== ==============================================================
+
+
+Image build command example
+---------------------------
+
+.. code:: bash
+
+ # Navigate to the directory containing the Dockerfile
+ cd {verl-root-path}/docker/ascend
+
+ # Build the image
+ # vLLM
+ docker build -f Dockerfile.ascend_8.3.rc1_a2 -t verl-ascend:8.3.rc1-a2 .
+ # SGLang
+ docker build -f Dockerfile.ascend.sglang_8.3.rc1_a2 -t verl-ascend-sglang:8.3.rc1-a2 .
+
+Public image registry
+---------------------
+
+Ascend hosts daily-built A2/A3 images at `quay.io/ascend/verl `_, built from the Dockerfiles above.
+
+Daily image name format: verl-{CANN version}-{NPU device type}-{OS version}-{python version}-latest
+
+verl release image name format: verl-{CANN version}-{NPU device type}-{OS version}-{python version}-{verl release version}
+
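+For instance, pulling a daily A2 image might look like the following; the tag is purely illustrative, so substitute the actual CANN/device/OS/Python fields shown in the registry:
+
+.. code:: bash
+
+ # hypothetical tag assembled from the naming format above
+ docker pull quay.io/ascend/verl:verl-cann8.3.rc1-a2-ubuntu22.04-py3.11-latest
+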
+Disclaimer
+--------------------
+The Ascend-related Dockerfiles and images provided in verl are reference samples for early trials. For production use, please reach out through official channels. Thank you.
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/ascend_sglang_best_practices.rst b/code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/ascend_sglang_best_practices.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e7a11299fa356c33fa5a4e3f11b0f179663a41de
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/ascend_sglang_best_practices.rst
@@ -0,0 +1,296 @@
+Ascend SGLang Best Practice
+===================================
+
+Last updated: 01/27/2026.
+
+.. _Qwen3-30B: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen3moe-30b_sglang_megatron_npu.sh
+.. _Qwen2.5-32B: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen2-32b_sglang_fsdp_npu.sh
+Introduction
+----------------------------------
+
+SGLang is a mainstream high-performance open-source inference engine, and Ascend now fully and natively supports using it in verl;
+a simple build process is all a developer needs to set up the environment. This document walks through two classic use cases covering:
+
+1. Environment setup
+2. Model training and evaluation
+3. Performance profiling
+
+The two use-case model scripts and their respective hardware requirements are:
+
++----------------------+---------------------+----------+------------------------+
+| Model                | NPU type            | Nodes    | Backend                |
++======================+=====================+==========+========================+
+| `Qwen3-30B`_         | Atlas 800T A3       | 1        | SGLang + Megatron      |
++----------------------+---------------------+----------+------------------------+
+| `Qwen2.5-32B`_       | Atlas 900 A2        | 2        | SGLang + FSDP          |
++----------------------+---------------------+----------+------------------------+
+
+Environment setup
+-----------------------------------
+The quickstart offers two ways to build the environment: 1. build from the Dockerfile; 2. build from a custom conda environment.
+
+In this practice we additionally pin verl to a specific commit id to avoid introducing unrelated issues
+
+.. code-block:: bash
+
+ cd verl
+ git checkout 772c224
+
+Model training and evaluation
+-----------------------------------
+1. Model and data preparation
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+`Qwen3-30B`_
+^^^^^^^^^^^^
+**Download model weights**
+
+--local-dir: the path where the model is saved
+
+.. code-block:: bash
+
+ export HF_ENDPOINT=https://hf-mirror.com
+ hf download --resume-download Qwen/Qwen3-30B-A3B --local-dir /path/to/local_dir
+
+**Download the dataset**
+
+.. code-block:: bash
+
+ git clone https://www.modelscope.cn/datasets/AI-ModelScope/DAPO-Math-17k.git
+
+**HuggingFace-to-Megatron weight conversion (optional)**
+
+.. code-block:: bash
+
+ python scripts/converter_hf_to_mcore.py \
+ --hf_model_path Qwen/Qwen3-30B-A3B \
+ --output_path Qwen/Qwen3-30B-A3B-mcore \
+ --use_cpu_initialization # only works for MoE models
+
+*Note: verl now supports flexible weight conversion between hf and mcore via mbridge; set the following parameters to load hf weights directly*
+
+.. code-block:: bash
+
+ actor_rollout_ref.actor.megatron.use_dist_checkpointing=False
+ actor_rollout_ref.actor.megatron.use_mbridge=True
+
+`Qwen2.5-32B`_
+^^^^^^^^^^^^^^
+**Download model weights**
+
+--local-dir: the path where the model is saved
+
+.. code-block:: bash
+
+ export HF_ENDPOINT=https://hf-mirror.com
+ hf download --resume-download Qwen/Qwen2.5-32B --local-dir /path/to/local_dir
+
+**Download and process the dataset**
+
+.. code-block:: bash
+
+ wget https://huggingface.co/datasets/agentica-org/DeepScaleR-Preview-Dataset/resolve/main/deepscaler.json
+ python recipe/r1_ascend/json_to_parquet.py --output_dir ./data/deepscaler --json_path path/to/deepscaler.json --train_data_ratio 0.9
+
+2. Training
+^^^^^^^^^^^
+Modify the following parameters in the training script according to your actual path configuration
+
+.. code-block:: bash
+
+ # Model Weights Paths
+ MODEL_PATH=Qwen/Qwen3-30B-A3B
+ MCORE_MODEL_PATH=Qwen/Qwen3-30B-A3B-mcore
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+
+ # File System Paths
+ TRAIN_FILE=$RAY_DATA_HOME/dataset/dapo-math-17k.parquet
+ TEST_FILE=$RAY_DATA_HOME/dataset/aime-2024.parquet
+
+ # save frequency; the default -1 saves nothing; change this if you want to evaluate checkpoints
+ trainer.save_freq=-1
+
+For the single-node `Qwen3-30B`_ task, you can run the example script in the verl repo directly with bash
+
+.. code-block:: bash
+
+ bash examples/grpo_trainer/run_qwen3moe-30b_sglang_megatron_npu.sh
+
+For the multi-node `Qwen2.5-32B`_ task, we recommend the following script for launching large-scale multi-node training
+
+.. code-block:: bash
+
+ pkill -9 python
+ ray stop --force
+ rm -rf /tmp/ray
+ export RAY_DEDUP_LOGS=0
+ export HYDRA_FULL_ERROR=1
+ # TASK_QUEUE_ENABLE: dispatch optimization; set to 1 in graph mode, 2 otherwise
+ export TASK_QUEUE_ENABLE=1
+ export HCCL_ASYNC_ERROR_HANDLING=0
+ export HCCL_EXEC_TIMEOUT=3600
+ export HCCL_CONNECT_TIMEOUT=3600
+
+ export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
+ export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
+ export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
+ export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8
+ # change to the path of the use case you want to run
+ DEFAULT_SH="./run_*.sh"
+ echo "Use $DEFAULT_SH"
+
+ ulimit -n 32768
+ mkdir logs
+
+ NNODES=2
+ NPUS_PER_NODE=8
+ # change to the IP of the master node
+ MASTER_ADDR="IP FOR MASTER NODE"
+ # change to the communication NIC of the current node
+ SOCKET_IFNAME="Your SOCKET IFNAME"
+ export HCCL_SOCKET_IFNAME="SOCKET IFNAME FOR CURRENT NODE"
+ export GLOO_SOCKET_IFNAME="SOCKET IFNAME FOR CURRENT NODE"
+ # get the current IP
+ CURRENT_IP=$(ifconfig $SOCKET_IFNAME | grep -Eo 'inet (addr:)?([0-9]{1,3}\.){3}[0-9]{1,3}' | awk '{print $NF}')
+ if [ "$MASTER_ADDR" = "$CURRENT_IP" ]; then
+ # start the head node
+ ray start --head --port 6766 --dashboard-host=$MASTER_ADDR --node-ip-address=$CURRENT_IP --dashboard-port=8260 --resources='{"NPU": '$NPUS_PER_NODE'}'
+
+ while true; do
+ ray_status_output=$(ray status)
+ npu_count=$(echo "$ray_status_output" | grep -oP '(?<=/)\d+\.\d+(?=\s*NPU)' | head -n 1)
+ npu_count_int=$(echo "$npu_count" | awk '{print int($1)}')
+ device_count=$((npu_count_int / $NPUS_PER_NODE))
+
+ # check whether device_count equals NNODES
+ if [ "$device_count" -eq "$NNODES" ]; then
+ echo "Ray cluster is ready with $device_count devices (from $npu_count NPU resources), starting Python script."
+ ray status
+ bash $DEFAULT_SH
+ break
+ else
+ echo "Waiting for Ray to allocate $NNODES devices. Current device count: $device_count"
+ sleep 5
+ fi
+ done
+ else
+ # worker nodes keep trying to register with the head node until success
+ while true; do
+ # try to connect to the ray cluster
+ ray start --address="$MASTER_ADDR:6766" --resources='{"NPU": '$NPUS_PER_NODE'}' --node-ip-address=$CURRENT_IP
+
+ # check whether the connection succeeded
+ ray status
+ if [ $? -eq 0 ]; then
+ echo "Successfully connected to the Ray cluster!"
+ break
+ else
+ echo "Failed to connect to the Ray cluster. Retrying in 5 seconds..."
+ sleep 5
+ fi
+ done
+ fi
+
+ sleep 600
+
+DEFAULT_SH: change to the path of the training sh file; in this case, the `Qwen2.5-32B`_ script.
+
+NNODES and NPUS_PER_NODE: change to the number of nodes used and the number of NPUs per node; here 2 and 8 respectively.
+
+MASTER_ADDR: change to the master node's IP; it must be identical on all nodes.
+
+SOCKET_IFNAME, HCCL_SOCKET_IFNAME, GLOO_SOCKET_IFNAME: change to the corresponding communication NIC, which can be obtained with:
+
+.. code-block:: bash
+
+ ifconfig |grep "$(hostname -I |awk '{print $1}'|awk -F '.' '{print $0}')" -B 1|awk -F ':' '{print$1}' | head -1 | tail -1
+
+3. Model evaluation
+^^^^^^^^^^^^^^^^^^^
+
+The steps are the same for every model; Qwen3-30B is used as the example.
+
+We evaluate the model with AISBench, a tool that supports evaluation against multiple inference backends including vllm and sglang
+
+**Installation**
+
+.. code-block:: bash
+
+ git clone https://gitee.com/aisbench/benchmark.git
+ cd benchmark
+ pip install -e .
+
+**Download the evaluation dataset**
+
+.. code-block:: bash
+
+ cd path/to/benchmark/ais_bench/datasets
+ wget http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/math.zip
+ unzip math.zip
+ rm math.zip
+
+**Modify the AISBench configuration to enable sglang inference evaluation**
+
+Open benchmark/ais_bench/benchmark/configs/models/vllm_api/vllm_api_stream_chat.py, the inference configuration file
+
+.. code-block:: python
+
+ from ais_bench.benchmark.models import VLLMCustomAPIChatStream
+ from ais_bench.benchmark.utils.model_postprocessors import extract_non_reasoning_content
+ from ais_bench.benchmark.clients import OpenAIChatStreamClient, OpenAIChatStreamSglangClient
+
+ models = [
+ dict(
+ attr="service",
+ type=VLLMCustomAPIChatStream,
+ abbr='sgl-api-stream-chat',
+ path="/path/to/Qwen3-30B", # 修改为 Qwen3-30B 模型路径
+ model="qwen3-30b",
+ request_rate = 0,
+ max_seq_len=2048,
+ retry = 2,
+ host_ip = "localhost", # IP of the inference service
+ host_port = 8005, # port of the inference service
+ max_out_len = 8192, # maximum output length in tokens
+ batch_size=48, # maximum inference concurrency
+ trust_remote_code=False,
+ custom_client=dict(type=OpenAIChatStreamSglangClient), # use the sglang client
+ generation_kwargs = dict(
+ temperature = 0,
+ seed = 1234,
+ ),
+ pred_postprocessor=dict(type=extract_non_reasoning_content)
+ )
+ ]
+
+
+**Start the sglang server**
+
+.. code-block:: bash
+
+ python -m sglang.launch_server --model-path "/path/to/Qwen3-30B" --tp-size 4 --dp-size 1 --port 8005
+
+**Run the evaluation with the sglang client**
+
+.. code-block:: bash
+
+ ais_bench --models vllm_api_stream_chat --datasets math500_gen_0_shot_cot_chat_prompt
+
+**Evaluation results**
+
+After training, the model's score on Math-500 rises significantly
+
++------+----------------------+---------+----------+------+----------------------+
+| iter | dataset | version | metric | mode | sgl-api-stream-chat |
++======+======================+=========+==========+======+======================+
+| 0 | math_prm800k_500 | c4b6f0 | accuracy | gen | 84.4 |
++------+----------------------+---------+----------+------+----------------------+
+| 150 | math_prm800k_500 | c4b6f0 | accuracy | gen | 91.7 |
++------+----------------------+---------+----------+------+----------------------+
+
+Performance profiling
+-----------------------------------
+For detailed documentation on NPU profiling, see `ascend_profiling_zh `_
+
+The `Qwen3-30B`_ script provides a basic profiling option, PROF_CONFIG; the default global_profiler.steps=null disables collection, and developers can adjust the parameters as needed, as sketched below
+
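+To turn collection on, a minimal override of the global profiler settings (the step numbers and path are illustrative) is:
+
+.. code-block:: bash
+
+ # profile steps 1 and 2 and write traces under ./outputs/profile
+ global_profiler.steps=[1,2] \
+ global_profiler.save_path=./outputs/profile
+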
+After collection, developers can parse the data with `MindStudio Insight `_
+
+Note: full profiling on the verl framework side produces a massive volume of duplicated operator records; per the documentation above, you can modify the code to profile only the key stages
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/dapo_multi_model_optimization_practice.md b/code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/dapo_multi_model_optimization_practice.md
new file mode 100644
index 0000000000000000000000000000000000000000..62b0cc15bc7b9bd2872f673cc9cfa8ec06d662cb
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/dapo_multi_model_optimization_practice.md
@@ -0,0 +1,324 @@
+# DAPO Introduction
+
+Last updated: 01/27/2026.
+
+The DAPO paper is available at [DAPO](https://arxiv.org/pdf/2503.14476); it includes the following key techniques.
+
+* **Clip-Higher**: promotes diversity and avoids entropy collapse by clipping the upper bound of the importance-sampling ratio.
+* **Dynamic Sampling**: improves training efficiency and stability. DAPO proposes a dynamic-sampling strategy that filters out prompt groups whose accuracy is exactly 1 or 0, keeping the number of prompts with effective gradients consistent across batches.
+* **Token-level Policy Gradient Loss**: essential in long chain-of-thought RL (long-CoT RL) scenarios.
+* **Overlong Reward Shaping**: reduces reward noise and stabilizes training.
+
+In verl, the DAPO algorithm can be run with the following settings.
+
+- **Reward-manager strategy set to DAPO**
+  For the DAPO algorithm, this must be configured as dapo.
+
+```
+reward_model.reward_manager=dapo
+```
+
+- **Clip-Higher**
+  `clip_ratio_low` and `clip_ratio_high` specify $\varepsilon_{\text {low }}$ and $\varepsilon_{\text {high }}$ in the DAPO objective.
+
+```
+clip_ratio_low=0.2 # lower clip ratio, default 0.2
+clip_ratio_high=0.28 # upper clip ratio, default 0.28
+```
+
+- **Dynamic sampling configuration**
+  Setting `filter_groups.enable` to `True` filters out groups whose output `metric` values are all identical; e.g., for the `acc` metric, groups whose output accuracies are all 1 or all 0 are dropped (see the sketch after this block).
+  The trainer keeps resampling with `gen_batch_size` until enough qualifying groups are generated, or until the cap set by `max_num_gen_batches` is reached.
+
+```
+data.gen_batch_size=${gen_prompt_bsz}
+algorithm.filter_groups.enable=${enable_filter_groups} # dynamic sampling switch
+algorithm.filter_groups.metric=${filter_groups_metric} # use accuracy as the filter criterion
+algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches} # maximum number of generation batches, i.e. the most times data generation may repeat
+```
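+
+A minimal sketch of this filtering rule (a hypothetical helper, not verl's actual implementation):
+
+```python
+# Keep only prompt groups whose per-response accuracies are not all identical;
+# groups whose accuracies are all 1 or all 0 carry no advantage signal and are dropped.
+def filter_groups(acc_by_prompt: dict) -> list:
+    kept = []
+    for prompt_id, accs in acc_by_prompt.items():
+        if len(set(accs)) > 1:  # the group still carries a learning signal
+            kept.append(prompt_id)
+    return kept
+
+# e.g. {"p0": [1, 1, 1], "p1": [0, 1, 1]} -> ["p1"]
+```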
+
+- **Token-level Loss**
+  Setting `loss_agg_mode` to `token-mean` means averaging the (policy-gradient) loss over all tokens of all sequences in a batch.
+
+```
+actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode}
+# Note: "token-mean" is the default behavior.
+```
+
+- **Overlong-response penalty in the reward model**
+  Setting `overlong_buffer.enable` to `True` penalizes outputs that are overly long but still within the hard context limit. Concretely, once the output length exceeds `max_response_length - overlong_buffer.len` by between `0` and `overlong_buffer.len` tokens, the penalty grows linearly from `0` to `overlong_buffer.penalty_factor` (see the sketch after this block).
+
+```
+reward_model.overlong_buffer.enable=${enable_overlong_buffer} # enable the overlong-buffer penalty mechanism for overlong outputs
+reward_model.overlong_buffer.len=${overlong_buffer_len} # buffer length in tokens
+reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} # penalty factor, the maximum penalty strength
+```
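+
+A minimal sketch of the linear penalty described above (a hypothetical helper, not verl's actual implementation); the returned value is the penalty magnitude to subtract from the reward:
+
+```python
+def overlong_penalty(resp_len: int, max_resp_len: int, buffer_len: int,
+                     penalty_factor: float) -> float:
+    expected_len = max_resp_len - buffer_len
+    exceed = resp_len - expected_len
+    if exceed <= 0:
+        return 0.0  # within the expected length: no penalty
+    # grows linearly from 0 to penalty_factor across the buffer
+    return min(exceed / buffer_len, 1.0) * penalty_factor
+
+# e.g. max_resp_len=2048, buffer_len=1024: a 1536-token answer gets 0.5 * penalty_factor
+```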
+
+For the code behind these parameters, see: [Recipe: Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO)](https://github.com/verl-project/verl-recipe/blob/main/dapo/README.md)
+
+# Hardware requirements
+
+Atlas 800T A3 and Atlas 900 A3 SuperPoD are currently supported. Running this best practice end to end requires two Atlas 800T A3 machines. For the key software versions, see: [Ascend Quickstart](https://github.com/volcengine/verl/blob/main/docs/ascend_tutorial/ascend_quick_start.rst)
+
+# Model training
+
+## Dataset preparation
+
+The Geometry3k dataset was jointly developed by UCLA and Zhejiang University for the geometry domain, targeting visual question answering (VQA) research and model training. It contains 3,002 samples in two modalities, image and text: the text covers various geometry problem statements, while the images render the geometric figures referenced in the problems, including basic shapes such as triangles, circles, and quadrilaterals, along with positional, nesting, and intersection relationships between figures. The raw dataset can be downloaded from the Hugging Face hub: [Geometry3k](https://huggingface.co/datasets/hiyouga/geometry3k)
+
+```bash
+# download the raw data and preprocess it
+python ./examples/data_preprocess/geo3k.py --local_dir=./data/geo3k
+```
+
+## Download weights
+
+Download the model weights from the Hugging Face hub: [Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct/tree/main
+)
+
+## Global environment variables
+
+- To make sure Ray processes can reclaim memory properly, install and enable the jemalloc library for memory management; it manages memory better and avoids OOM during long runs.
+
+```
+# set the jemalloc environment variable according to your actual install path
+```
+
+- Some models are optimized by vllm ascend, but in some cases the optimized model may not be suitable. In that case, set this value to 0 to disable the optimized model.
+
+```
+export USE_OPTIMIZED_MODEL=0
+```
+
+- Enable vLLM V1
+
+```
+export VLLM_USE_V1=1
+```
+
+- Fallback setting for Ascend multi-card communication: extends the connection timeout so that training startup does not fail due to slow connections in a cluster environment
+
+```
+export HCCL_CONNECT_TIMEOUT=5400
+```
+
+- Controls whether vLLM enables the NZ optimization on Ascend chips
+
+```
+export VLLM_ASCEND_ENABLE_NZ=0
+```
+
+- Adjust the configuration to your machines; for example, a single A2 node can set `trainer.nnodes` to 1 and `trainer.n_gpus_per_node` to 8
+
+## Training script
+
+Based on the modifications above, an example configuration is provided; create a run_dapo_qwen3_vl_30b.sh file.
+
+```bash
+set -xeuo pipefail
+
+export VLLM_USE_V1=1
+export HCCL_CONNECT_TIMEOUT=5400
+export VLLM_ASCEND_ENABLE_NZ=0
+export LD_PRELOAD=/usr/local/lib/libjemalloc.so.2
+# Some models are optimized by vllm ascend. While in some case, e.g. rlhf training,
+# the optimized model may not be suitable. In this case, set this value to 0 to disable the optimized model.
+export USE_OPTIMIZED_MODEL=0
+
+project_name='DAPO'
+exp_name='DAPO-Qwen3-vl-30B'
+
+adv_estimator=grpo
+
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=False
+kl_loss_coef=0.0
+
+clip_ratio_low=0.2
+clip_ratio_high=0.28
+
+max_prompt_length=1024
+max_response_length=2048
+enable_overlong_buffer=False
+overlong_buffer_len=$((1024 * 2))
+overlong_penalty_factor=1.0
+
+loss_agg_mode="token-mean"
+
+enable_filter_groups=True
+filter_groups_metric=acc
+max_num_gen_batches=4
+train_prompt_bsz=64
+gen_prompt_bsz=$((train_prompt_bsz * 3))
+n_resp_per_prompt=8
+train_prompt_mini_bsz=16
+
+# Ray
+PWD=./
+RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+
+# Paths
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-VL-30B-A3B-Instruct"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/geo3k/train.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/geo3k/test.parquet"}
+
+# Algorithm
+temperature=1.0
+top_p=1.0
+top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+val_top_p=0.7
+
+# Performance Related Parameter
+sp_size=8
+use_dynamic_bsz=True
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size))
+gen_tp=8
+fsdp_size=16
+
+ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
+ --working-dir "${WORKING_DIR}" \
+ --address "${RAY_ADDRESS}" \
+ -- python3 -m recipe.dapo.main_dapo \
+ data.train_files="${TRAIN_FILE}" \
+ data.val_files="${TEST_FILE}" \
+ data.prompt_key=prompt \
+ data.truncation='left' \
+ data.max_prompt_length=${max_prompt_length} \
+ data.max_response_length=${max_response_length} \
+ data.gen_batch_size=${gen_prompt_bsz} \
+ data.train_batch_size=${train_prompt_bsz} \
+ actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+ algorithm.adv_estimator=${adv_estimator} \
+ algorithm.use_kl_in_reward=${use_kl_in_reward} \
+ algorithm.kl_ctrl.kl_coef=${kl_coef} \
+ actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+ actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+ actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+ actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+ actor_rollout_ref.actor.clip_ratio_c=10.0 \
+ algorithm.filter_groups.enable=${enable_filter_groups} \
+ algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches} \
+ algorithm.filter_groups.metric=${filter_groups_metric} \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+ actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+ actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \
+ actor_rollout_ref.model.path="${MODEL_PATH}" \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+ actor_rollout_ref.actor.optim.weight_decay=0.1 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
+ actor_rollout_ref.actor.use_torch_compile=False \
+ actor_rollout_ref.actor.entropy_coeff=0 \
+ actor_rollout_ref.actor.grad_clip=1.0 \
+ actor_rollout_ref.rollout.enforce_eager=True \
+ actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+ actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.70 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+ actor_rollout_ref.rollout.enable_chunked_prefill=True \
+ actor_rollout_ref.rollout.temperature=${temperature} \
+ actor_rollout_ref.rollout.top_p=${top_p} \
+ actor_rollout_ref.rollout.top_k="${top_k}" \
+ actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+ actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+ actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+ actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+ actor_rollout_ref.rollout.val_kwargs.n=1 \
+ actor_rollout_ref.rollout.expert_parallel_size=8 \
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+ actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+ actor_rollout_ref.rollout.name=vllm \
+ +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+ actor_rollout_ref.actor.strategy=fsdp2 \
+ actor_rollout_ref.ref.strategy=fsdp2 \
+ critic.strategy=fsdp2 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+ actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+ reward_model.reward_manager=dapo \
+ reward_model.overlong_buffer.enable=${enable_overlong_buffer} \
+ reward_model.overlong_buffer.len=${overlong_buffer_len} \
+ reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} \
+ trainer.logger=console \
+ trainer.project_name="${project_name}" \
+ trainer.experiment_name="${exp_name}" \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=2 \
+ trainer.val_before_train=True \
+ trainer.test_freq=1 \
+ trainer.save_freq=20 \
+ trainer.resume_mode=auto \
+ trainer.device=npu \
+ trainer.total_epochs=30 \
+ trainer.total_training_steps=100 \
+ trainer.default_local_dir="${CKPTS_DIR}"
+```
+
+# Optimization reference
+
+- **Enable dynamic batch size**
+  Dynamically adjusts the batch size according to the per-GPU maximum token budget (ppo_max_token_len_per_gpu)
+
+```
+actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}
+actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
+actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
+```
+
+- **Maximum tokens a single GPU can process**
+  When `use_dynamic_bsz=True`, the maximum number of tokens a single GPU can process in one micro-batch
+
+```
+actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len}
+actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
+actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
+```
+
+- **Per-GPU micro-batch size**
+  When `use_dynamic_bsz=True`, the framework starts from this value as the initial batch size, then adjusts it up or down according to `ppo_max_token_len_per_gpu`
+
+```
+actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2
+actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2
+actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2
+```
+
+- **Enable the FSDP2 framework**
+  Shards model parameters, gradients, and optimizer states across GPUs, avoiding out-of-memory from loading the full model on a single card.
+
+```
+# enable the FSDP2 framework
+actor_rollout_ref.actor.strategy=fsdp2
+actor_rollout_ref.ref.strategy=fsdp2
+critic.strategy=fsdp2
+
+# FSDP2 only: reshard after the forward pass to reduce memory usage.
+actor_rollout_ref.actor.fsdp_config.reshard_after_forward=True
+# FSDP2 only: whether to reshard after the model's forward pass to save memory.
+actor_rollout_ref.ref.fsdp_config.reshard_after_forward=True
+```
+
+- **Enable expert parallelism**
+  Specifies how many GPUs are used to compute the different expert networks in parallel
+
+```
+# expert-parallel configuration for an MoE-architecture actor model
+actor_rollout_ref.rollout.expert_parallel_size=8
+```
+
+
diff --git a/code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/gspo_optimization_practice.md b/code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/gspo_optimization_practice.md
new file mode 100644
index 0000000000000000000000000000000000000000..e943fcdbfff6b68b11a941990669b8cec8990391
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/gspo_optimization_practice.md
@@ -0,0 +1,233 @@
+## NPU Qwen3-32B GSPO Optimization Practice
+
+Last updated: 01/27/2026.
+
+The script for this article: [qwen3_32b_gspo_npu](https://github.com/volcengine/verl/blob/main/examples/gspo_trainer/run_qwen3_32b_gspo_npu.sh)
+
+### Algorithm adaptation
+
+GSPO raises the optimization granularity from the **token level** to the **sequence level**, avoiding the sharp variance growth that destabilizes GRPO training; this improves training stability and also speeds up convergence to some extent.
+
+To invoke the GSPO algorithm in the verl repository, the following configuration is required (see the sketch after this block)
+
+~~~bash
+# core algorithm configuration
+algorithm.adv_estimator=grpo \ # use the GRPO advantage estimator
+algorithm.use_kl_in_reward=False \ # no KL penalty in the reward
+# GSPO policy-loss mode
+actor_rollout_ref.actor.policy_loss.loss_mode=gspo \ # enable the GSPO policy loss
+# tiny clip ranges (a GSPO hallmark)
+actor_rollout_ref.actor.clip_ratio_low=0.0003 \ # lower clip bound, value recommended by the paper
+actor_rollout_ref.actor.clip_ratio_high=0.0004 \ # upper clip bound, value recommended by the paper
+# KL settings (GSPO does not use a KL loss)
+actor_rollout_ref.actor.use_kl_loss=False \ # disable the KL loss
+actor_rollout_ref.actor.kl_loss_coef=0.0 \ # set the KL loss coefficient to 0
+# sequence-level loss aggregation (the core of GSPO)
+actor_rollout_ref.actor.loss_agg_mode=seq-mean-token-mean \ # sequence-level averaging, recommended by the GSPO paper
+# batch configuration
+actor_rollout_ref.rollout.n=16 \ # generate 16 responses per prompt (group sampling)
+~~~
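+
+A minimal sketch of GSPO's sequence-level importance ratio (following the GSPO paper's definition; this is not verl's actual implementation): the ratio is length-normalized, i.e. the geometric mean of the per-token ratios, and it is this per-sequence ratio that the tiny clip range above is applied to.
+
+~~~python
+import torch
+
+def gspo_seq_ratio(logp_new: torch.Tensor, logp_old: torch.Tensor,
+                   mask: torch.Tensor) -> torch.Tensor:
+    # logp_*: (batch, seq_len) per-token log-probs; mask is 1 on response tokens
+    log_ratio = (logp_new - logp_old) * mask
+    # mean token log-ratio per sequence = log of the geometric-mean ratio
+    seq_mean = log_ratio.sum(dim=-1) / mask.sum(dim=-1).clamp(min=1)
+    return seq_mean.exp()  # one scalar importance ratio per sequence
+~~~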
+
+The entry point is typically `verl.trainer.main_ppo`.
+
+### Performance tuning
+
+Optimizations fall into four areas: training, inference, scheduling, and miscellaneous.
+
+#### Training
+
+##### Dynamic batch size
+
+~~~bash
+actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size))
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size))
+~~~
+
+**This optimization mainly tunes the two parameters above; note that setting them too large causes OOM.**
+
+**Mainly tune** `actor_ppo_max_token_len`: increasing it reduces training time. Tuning `infer_ppo_max_token_len` brings no obvious gain and can be left alone.
+
+**What these two parameters do:**
+
+**They control the maximum number of tokens each GPU processes in dynamic batch-size mode.**
+
+- **`actor_ppo_max_token_len`**: the maximum tokens per GPU during the actor's PPO update (forward + backward pass)
+- **`infer_ppo_max_token_len`**: the maximum tokens per GPU when computing log-probabilities in the inference stages (reference policy and rollout)
+
+#### Inference
+
+##### ACLGraph + FULL_DECODE_ONLY
+
+An optimization of inference operator dispatch, yielding roughly `15%~20%` performance gain on average.
+
+First, enabling **ACLGraph** alone:
+
+~~~bash
+# enable ACLGraph + FULL_DECODE_ONLY (note: when this is set to False, TASK_QUEUE_ENABLE must be set to 1, otherwise an error is raised)
+actor_rollout_ref.rollout.enforce_eager=False
+actor_rollout_ref.rollout.engine_kwargs.vllm.compilation_config.cudagraph_capture_sizes='[8,16,32,64,128]' \
+actor_rollout_ref.rollout.engine_kwargs.vllm.compilation_config.cudagraph_mode='FULL_DECODE_ONLY' \
+~~~
+
+When `FULL_DECODE_ONLY` is enabled successfully, the following output appears:
+
+
+
+**Guide to setting `cudagraph_capture_sizes`**
+
+The values in `cudagraph_capture_sizes` correspond to batch sizes. This is not the per-DP-group batch size from the config; it is the batch size from vllm's perspective, measured in **tokens**.
+
+The default generation algorithm is as follows, for reference:
+
+
+
+##### Switching the inference backend
+
+Usage: `export VLLM_ATTENTION_BACKEND=XFORMERS`
+
+
+
+Note: some backends are not supported in older vllm-ascend versions.
+
+##### Enabling the vllm v1 engine
+
+Usage: `export VLLM_USE_V1=1`
+
+This can be left on permanently; it is generally a net gain.
+
+#### Scheduling
+
+##### AIV
+
+How to enable: set `export HCCL_OP_EXPANSION_MODE="AIV"`
+
+The HCCL_OP_EXPANSION_MODE environment variable configures where communication algorithms are expanded and scheduled. Supported values:
+
+- AI_CPU: communication algorithms are expanded on the device-side AI CPU compute unit.
+- AIV: communication algorithms are expanded on the device-side Vector Core compute unit.
+- HOST: communication algorithms are expanded on the host-side CPU; the device selects the appropriate scheduler automatically based on the hardware model.
+- HOST_TS: communication algorithms are expanded on the host-side CPU; the host dispatches tasks to the device's Task Scheduler, which schedules and executes them.
+
+The two expansion mechanisms are described below.
+
+###### HOST expansion
+
+
+
+- The software stack runs on the host CPU, which expands the communication algorithm into individual tasks
+- Each task calls a runtime API and is dispatched to the device's RTS queue
+- STARS fetches tasks from the RTS queue in order
+- Depending on the task type, it invokes the SDMA or RDMA engine.
+ **Single-operator bottleneck**: host-bound, each task submission takes 2~5 us and one communication operator consists of hundreds of tasks; in the single-operator case nothing is cached on the device, so each task executes as soon as it is dispatched
+
+###### AI CPU expansion
+
+
+
+- Instead of dispatching individual tasks, the host packages each communication operator as a kernel and places it on the communication-operator kernel queue.
+- STARS schedules the kernels on that queue stream and hands them to the AI CPU for execution.
+- The AI CPU invokes the kernel function on a thread; inside the function, the communication tasks are expanded and placed on the RTS queue for STARS to dispatch.
+- Host/AI CPU interaction drops from hundreds of round trips to one.
+- Task submission happens on the AI CPU, where submissions are partially merged.
+
+##### TASK_QUEUE_ENABLE
+
+**Usage:** `export TASK_QUEUE_ENABLE=2`
+
+TASK_QUEUE_ENABLE is a dispatch optimization: set it to 1 in graph mode (i.e., when graph mode is enabled this must be 1) and to 2 in non-graph mode.
+
+Diagram:
+
+
+
+##### CPU core binding
+
+**Usage:** `export CPU_AFFINITY_CONF=1`
+
+For the detailed principles, see: https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0059.html
+
+#### Others
+
+The following collects tuning configurations for several global environment variables. Since these settings tend to benefit both the training and inference phases, and we do not yet have fine-grained ablations that attribute the gains to either phase, they are grouped here for ongoing monitoring and further analysis.
+
+##### Enabling jemalloc
+
+Usage (note: the jemalloc library must be installed first): `export LD_PRELOAD=/usr/local/lib/libjemalloc.so.2`
+
+**Installation and usage guide:** [MindSpeed-RL/docs/install_guide.md · Ascend/MindSpeed-RL - AtomGit | GitCode](https://gitcode.com/Ascend/MindSpeed-RL/blob/master/docs/install_guide.md#高性能内存库-jemalloc-安装)
+
+##### Multi-stream memory reuse
+
+This provides memory savings.
+
+How to enable: `export MULTI_STREAM_MEMORY_REUSE=1`
+
+How it works: https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0040.html
+
+##### VLLM_ASCEND_ENABLE_FLASHCOMM
+
+Usage: `export VLLM_ASCEND_ENABLE_FLASHCOMM=1`
+
+Enables FLASHCOMM, an Ascend-NPU-specific high-speed communication optimization
+
+Reference: https://vllm-ascend.readthedocs.io/zh-cn/latest/user_guide/release_notes.html
+
+##### VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE
+
+Usage: `export VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE=1`
+
+Enables Ascend NPU dense-compute optimizations for LLM inference
+
+Reference: https://vllm-ascend.readthedocs.io/zh-cn/latest/user_guide/release_notes.html
+
+##### VLLM_ASCEND_ENABLE_PREFETCH_MLP
+
+Usage: `export VLLM_ASCEND_ENABLE_PREFETCH_MLP=1`
+
+Enables the weight-prefetch mechanism for MLP layers
+
+
+
+##### verl framework parameters
+
+These are mainly memory-related switches (note: each optimization here degrades throughput to some degree).
+
+~~~bash
+# Gradient checkpointing
+# Effect: saves memory by recomputing activations, trading compute for memory. Intermediate activations are not stored during the forward pass and are recomputed during the backward pass, significantly lowering memory usage and allowing a larger batch size.
+actor_rollout_ref.model.enable_gradient_checkpointing=True
+
+# Parameter offload
+# Effect: offloads model parameters to CPU memory and loads them back to the GPU for training.
+actor_rollout_ref.actor.fsdp_config.param_offload=${offload} # True
+actor_rollout_ref.ref.fsdp_config.param_offload=${offload} # True
+
+# Optimizer offload
+# Effect: offloads optimizer state (e.g., Adam momentum) to the CPU. Optimizer state usually occupies a lot of memory (for Adam, an extra 8 bytes per parameter), so offloading saves memory.
+actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} # True
+
+# Free cache engine
+# Effect: frees the inference engine's KV cache and weights during training. This is the core optimization of the 3D-HybridEngine, allowing inference and training to alternate on the same GPU and significantly reducing memory requirements.
+actor_rollout_ref.rollout.free_cache_engine=True
+
+# Entropy computation optimizations
+# entropy_checkpointing: recompute the entropy during training to lower peak memory
+# entropy_from_logits_with_chunking: process the logits tensor in chunks (e.g., 2048 tokens at a time) instead of materializing the whole [bsz*seq_len, vocab] tensor at once
+actor_rollout_ref.actor.entropy_checkpointing=True
+actor_rollout_ref.ref.entropy_checkpointing=True
+actor_rollout_ref.actor.entropy_from_logits_with_chunking=True
+actor_rollout_ref.ref.entropy_from_logits_with_chunking=True
+
+# Inference engine memory configuration
+# gpu_memory_utilization: fraction of GPU memory vLLM may use (0.90 = 90%)
+# enforce_eager=False: enable CUDA graphs to speed up inference, at the cost of extra memory
+actor_rollout_ref.rollout.gpu_memory_utilization=0.90
+actor_rollout_ref.rollout.enforce_eager=False
+~~~
+
+### NPU Tuning References
+
+Environment variables: [Environment Variable List - Ascend Extension for PyTorch 6.0.0 - Ascend Community](https://www.hiascend.com/document/detail/zh/Pytorch/600/apiref/Envvariables/Envir_001.html)
+
+Community performance-tuning tutorial: [Performance Tuning Workflow - Ascend Extension for PyTorch 6.0.0 - Ascend Community](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0001.html)
+
diff --git a/code/RL_model/verl/verl_train/docs/blog/v0.7.md b/code/RL_model/verl/verl_train/docs/blog/v0.7.md
new file mode 100644
index 0000000000000000000000000000000000000000..0bf3c31c3e9cd771451546a825cf9a74504c1cb7
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/blog/v0.7.md
@@ -0,0 +1,274 @@
+# verl 0.7 release blog
+
+**Author:** verl team
+
+Last updated: 01/03/2026.
+
+## Overview
+verl adopts a Hybrid-Controller architecture (also known as HybridFlow). Sharing design principles with asynchronous sharded dataflow systems like Google Pathways, verl models Reinforcement Learning (RL) algorithms, such as PPO, GRPO, DAPO, and others, as a multi-stage, multi-model and parallelizable dataflow graph.
+
+To balance flexibility with performance, verl unifies two distinct programming models:
+
+**High-Level Single-Controller (MPMD)**: At the orchestration level, a single process `RLTrainer` manages the global computation graph. It handles macro-tasks such as scheduling rollout generation, triggering reward scoring, and dispatching distributed training jobs.
+
+**Internal Multi-Controller (SPMD)**: Internally, the Model Engine operates in standard distributed training mode. Workers execute identical programs via trainer backends (FSDP, Megatron, or VeOmni) or rollout executors (not rollout servers) such as vLLM/SGLang/TensorRT-LLM, performing heavy distributed computation and synchronizing via collective communication.
+
+
+

+
+
+This hybrid approach offers significant advantages:
+
+**Flexible Orchestration**: The single-controller design allows verl to dynamically manage complex constraints within the computation graph, including flexible data dependencies, diverse resource allocation and model placement, and fine-grained asynchronous staleness control.
+
+**Abstraction of Complexity**: We encapsulate complex parallel strategies—such as 5D parallelism (DP, TP, CP, PP, and EP)—strictly within the Model Engine. This allows users to focus entirely on RL algorithm implementation without getting bogged down by the details of distributed training.
+
+Furthermore, leveraging Ray placement groups, verl provides `ResourcePool` and `WorkerGroup` abstractions. These enable flexible GPU sharing among the various roles in the RL process—such as actor, critic, reward, and rollout—allowing components to share resources efficiently while remaining isolated.
+
+As illustrated in the diagram below, the overall architecture of verl is divided into two layers:
+
+- **verl-core**: provides four components required for the RL pipeline: model engine, rollout engine, checkpoint engine, and transfer queue. Each component exposes abstract interfaces, making them both extensible and pluggable.
+- **verl-trainer**: builds upon these components to construct various RL pipelines, such as on-policy, one-step-off-policy, and fully asynchronous, tailored to the demands of diverse scenarios.
+
+
+

+
+
+
+## verl-core
+### Model Engine
+
+The Model Engine serves as verl's core training engine, defining a set of abstract interfaces that support pluggable backends. It operates in SPMD mode:
+- SFT: Workers are launched via torchrun.
+- RL: Workers are executed via the WorkerGroup API, invoked by the single-controller.
+
+The abstract interfaces include methods like `initialize`, `forward`, `optimizer_step`, and `load`/`offload`. Integrating a new training engine simply requires inheriting and implementing these interfaces. Crucially, because all backends adhere to this unified abstraction, adding a new Model Engine requires absolutely no code modification on the caller side. The RLTrainer remains completely agnostic to the backend's specific parallel strategy when calling these interfaces, while the WorkerGroup automatically handles data dispatch and collection based on the underlying parallelism.
+
+Currently, the Model Engine supports the following backends (more backends may be supported in the future, e.g., torchtitan):
+
+|Backend|Parallelism|Performance|Supported Models|New Model Support Time|
+|-----|-----|----|----|----|
+|FSDP|FSDP+SP|Dense medium / MoE low|all transformer models|Day 0|
+|MCore|DP+TP+PP+EP+CP|High|see the [Megatron-Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge) supported model list|a few weeks or months|
+|VeOmni|FSDP+SP+EP|Medium|see the [VeOmni](https://github.com/ByteDance-Seed/VeOmni) supported model list|~1 week|
+
+```python
+class BaseEngine:
+ def initialize(self):
+ """Instantiate or load the model, optimizer, and learning rate scheduler."""
+ raise NotImplementedError
+
+ def optimizer_zero_grad(self):
+ """Zero the gradients of the optimizer."""
+ raise NotImplementedError
+
+ def optimizer_step(self):
+ """Perform an optimization step using the optimizer."""
+ raise NotImplementedError
+
+ def lr_scheduler_step(self):
+ """Advance the learning rate scheduler by one step."""
+ raise NotImplementedError
+
+ def forward_backward_batch(self, data: TensorDict, loss_function: Callable, forward_only=False) -> Any:
+ """Perform a forward pass and optionally a backward pass on a batch of data."""
+ raise NotImplementedError
+
+ def get_per_tensor_param(self) -> tuple[Generator[tuple[str, torch.Tensor], None, None], Optional[dict]]:
+ """Get a generator that yields per-tensor parameters and optional peft config."""
+ raise NotImplementedError
+
+ def to(self, device: str, model: bool = True, optimizer: bool = True, grad: bool = True):
+ """Move model parameters, optimizer states, or both to the specified device."""
+ raise NotImplementedError
+```
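+
+As an illustration of how small that surface area is, here is a toy sketch of a backend (plain single-device PyTorch, not a shipped engine; the constructor and the loss-function contract are assumptions for the example):
+
+```python
+import torch
+
+class ToyEngine(BaseEngine):
+    """Toy single-device engine; illustrative only, not a real verl backend."""
+
+    def __init__(self, model: torch.nn.Module, lr: float = 1e-6):
+        self.model = model
+        self.lr = lr
+        self.optimizer = None
+
+    def initialize(self):
+        # Instantiate the optimizer, as required by the interface
+        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.lr)
+        self.model.train()
+
+    def optimizer_zero_grad(self):
+        self.optimizer.zero_grad()
+
+    def optimizer_step(self):
+        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
+        self.optimizer.step()
+
+    def forward_backward_batch(self, data, loss_function, forward_only=False):
+        output = self.model(**data)
+        if forward_only:
+            return output
+        loss = loss_function(output, data)  # assumed contract: returns a scalar
+        loss.backward()
+        return loss
+```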
+
+
+### Rollout Engine
+As LLM reinforcement learning evolves from single-turn, static tasks to multi-turn, dynamic, and interactive agentic tasks, the legacy SPMD rollout mode previously used by verl has become insufficient. Consequently, in verl v0.7, we have removed the SPMD rollout mode and switched to rollout server mode by default.
+
+
+

+
+
+In the server mode, the LLM server operates as online serving rather than the traditional offline batch inference. Clients send per-sample requests to the server, enabling the engine to utilize dynamic batching. This significantly enhances throughput efficiency for multi-turn conversation. Furthermore, the server-based approach eliminates the need for intrusive modifications to the LLM inference engine, allowing for the seamless integration of modern inference backends such as vLLM, SGLang, and TensorRT-LLM.
+
+On the client side, verl introduces an extensible **AgentLoop** abstraction designed to define custom agentic task loops. This abstraction manages the cycle of requesting responses from the LLM server and interacting with external environments to obtain feedback. We provide two default implementations:
+- **SingleTurnAgentLoop**: Designed for standard single-turn tasks.
+- **ToolAgentLoop**: Designed for classic ReAct architectures involving multi-turn tool invocation.
+
+Users can implement custom AgentLoop logic tailored to their specific needs, such as [SWEAgentLoop](https://github.com/volcengine/verl/pull/4080) or GUIAgentLoop.
+
+```python
+class AgentLoopBase(ABC):
+ @abstractmethod
+ async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput:
+ """Run agent loop to interact with LLM server and environment.
+
+ Args:
+ sampling_params (Dict[str, Any]): LLM sampling params.
+ **kwargs: dataset fields from `verl.utils.dataset.RLHFDataset`.
+
+ Returns:
+ AgentLoopOutput: Agent loop output.
+ """
+ raise NotImplementedError
+```
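+
+For instance, a custom loop might look roughly like the sketch below (the `server_manager.generate` helper and the `AgentLoopOutput` fields are assumptions for illustration, not the exact verl API):
+
+```python
+class EchoAgentLoop(AgentLoopBase):
+    """Toy single-turn loop: one request to the LLM server, no tool calls."""
+
+    async def run(self, sampling_params, **kwargs):
+        prompt_ids = kwargs["raw_prompt_ids"]  # assumed dataset field
+        response_ids = await self.server_manager.generate(  # assumed helper
+            prompt_ids=prompt_ids, sampling_params=sampling_params
+        )
+        return AgentLoopOutput(  # field names assumed for illustration
+            prompt_ids=prompt_ids,
+            response_ids=response_ids,
+            num_turns=1,
+        )
+```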
+
+### TransferQueue
+As mentioned, verl uses a global single-controller RLTrainer to orchestrate the computation graph. A major limitation in the current implementation is that the RLTrainer handles both control and data flow, creating a bottleneck when dispatching data between components. This issue is amplified by the massive data volumes in multimodal training (images, video, audio) and complex algorithms like router replay, which requires transmitting large tensors per sample. Our earlier attempt to solve this using the Ray object store yielded poor performance due to the lack of tensor optimization and fine-grained column access.
+
+
+

+
+
+In v0.7, we experimentally introduced **TransferQueue** to decouple control flow from data flow. The RLTrainer now only dispatches instructions and metadata, while TransferQueue handles data transmission via reference passing. TransferQueue is specifically optimized for PyTorch tensors (supporting zero-copy and RDMA) and allows for backend extensions like ZeroMQ, NIXL, and Ray RDT. We plan to make this the default transmission method in v0.8.
+
+```python
+# In PPOTrainer
+def fit(self):
+ batch = next(dataloader)
+ gen_batch: BatchMeta = self.rollout_manager.generate_sequences(batch)
+ output: BatchMeta = self.actor_rollout_wg.compute_log_prob(gen_batch)
+ gen_batch = gen_batch.union(output)
+ output = self.actor_rollout_wg.update_actor(gen_batch)
+
+# In Worker
+def compute_log_prob(self, batch: BatchMeta) -> BatchMeta:
+ data = tq.get(batch)
+ output = self.actor.infer_batch(data=data)
+ return tq.put(output)
+```
+
+### Checkpoint Engine
+
+With the increase in LLM context lengths and the evolution of agentic tasks, the "long-tail" problem in rollout has become prominent, limiting the overall efficiency of RL training.
+
+To mitigate this, a viable strategy is moving from on-policy synchronous training to off-policy asynchronous training, e.g., [Laminar](https://arxiv.org/abs/2510.12633), [Areal](https://arxiv.org/abs/2505.24298), [StreamRL](https://arxiv.org/abs/2504.15930), [LlamaRL](https://arxiv.org/pdf/2505.24034), [PipelineRL](https://arxiv.org/abs/2509.19128). This involves separating the rollout and model engines onto different nodes (a disaggregated architecture, as opposed to colocated), with data transmitted via queues. This separation alleviates the rollout long-tail issue and enables rollout elastic scaling, fault tolerance, and heterogeneous hardware. However, it introduces a new challenge: efficient cross-node parameter synchronization.
+
+
+

+
+
+To address this, we introduce the Checkpoint Engine: a unified abstraction layer designed to synchronize weights between various training and inference backends.
+- It provides three unified APIs to implement the streaming transmission of parameters.
+- Users can extend the Transport Layer implementation based on their specific infrastructure requirements (device, network, local cache, etc.).
+
+Currently, we provide two transport backends: NCCL (for broadcast collective communication) and NIXL (for P2P point-to-point communication).
+
+```python
+class CheckpointEngine(ABC):
+ @abstractmethod
+ async def send_weights(self, weights: Generator[tuple[str, torch.Tensor], None, None]):
+ """Send the weights of the model.
+
+ Args:
+ weights: A generator that yields the name of the weight tensor and the tensor itself.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ async def receive_weights(self) -> Generator[tuple[str, torch.Tensor], None, None]:
+ """Receive the weights of the model.
+
+ Yields:
+ A tuple of the name of the weight tensor and the tensor itself.
+ """
+ raise NotImplementedError
+```
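+
+Because the send/receive pair above is symmetric, a weight sync is just both ends driven concurrently. A minimal sketch, assuming the receiver hands each tensor to some backend-specific `apply_fn` that loads it into the inference engine:
+
+```python
+import asyncio
+
+async def sync_weights(sender, receiver, model, apply_fn):
+    async def recv_side():
+        weights = await receiver.receive_weights()
+        for name, tensor in weights:  # streamed tensor-by-tensor
+            apply_fn(name, tensor)    # backend-specific weight loading
+
+    await asyncio.gather(
+        sender.send_weights((n, p.detach()) for n, p in model.named_parameters()),
+        recv_side(),
+    )
+```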
+
+## verl-trainer
+Building upon the four core components provided by verl-core, verl-trainer constructs several RL training pipelines tailored to specific scenarios. These pipelines are designed to address training efficiency challenges across varying scales and requirements:
+
+**On-policy (Synchronous)**
+ - Main Features: Executes rollout and training serially, typically sharing GPU resources (Colocate). It strictly adheres to standard on-policy algorithm definitions, where training must wait for all samples to be generated.
+ - Scenarios: Best for baseline implementations, scenarios where strict algorithmic correctness is prioritized over training throughput.
+
+**One-step-off-policy (Async)**
+ - Main Features: Parallelizes generation and training by overlapping the current training step with the next batch's generation. It employs resource isolation and uses parameters from the previous step for rollout to minimize GPU idle time.
+ - Scenarios: Ideal for scenarios requiring moderate efficiency gains (20%–40%) while maintaining training stability very close to strict on-policy methods.
+
+**Fully async (Decoupled & Streaming)**
+ - Main Features: Completely decouples the Trainer and Rollouter onto separate nodes. It utilizes streaming data transfer, staleness control, and partial rollout mechanisms to maximize throughput and mitigate long-tail generation latency.
+ - Scenarios: Essential for large-scale training (e.g., 128+ GPUs) or complex reasoning tasks (e.g., long chain-of-thought) where generation latency significantly bottlenecks performance.
+
+
+

+
+
+## Roadmap
+### v0.7 release
+
+**Model Engine**
+- Integrate Megatron-Bridge and support LoRA/PEFT, see blog post: [How We Build Trillion Parameter Reasoning RL with 10% GPUs](https://macaron.im/mindlab/research/building-trillion-parameter-reasoning-rl-with-10-gpus)
+- Support experimental fp8 training for megatron backend
+- Support new model for megatron backend: GPT-OSS, Qwen3-Next
+- Comprehensive support for the new model engine; the FSDP and Megatron engines are production-ready.
+ - Dispatch tensordict with nested tensor instead of padded DataProto
+ - Add TrainingWorker that resembles Tinker-like API
+ - Add VLM support for model engine, SFT and RL trainer
+ - Add model engine based critic model
+ - Implement ActorRolloutRefWorker by TrainingWorker, support different backend in one worker
+- New VeOmni engine added, still in alpha status.
+
+**Rollout Engine**
+- Remove SPMD rollout mode
+- Support blockwise fp8 rollout for vllm and sglang; support online quant for vllm with torchao
+- Experimental router replay support for vllm
+- Optimize multi-modal data fetch and preprocess, support video input
+- Upgrade to vllm==0.12.0; sglang==0.5.6
+
+**Reward**
+- Support hybrid reward scenarios, including generative, discriminative, rule-based rewards, and their combinations.
+- Refactor reward models into server mode, supporting both colocated and standalone deployments.
+- Introduce new reward managers to handle more complex scenarios, limited mode for request rate control and remote mode for CPU-intensive tasks.
+
+**Algorithm**
+- Add [CISPO](https://arxiv.org/pdf/2506.13585): Clipped IS-weight Policy Optimization
+- Add [SAPO](https://arxiv.org/abs/2511.20347): Soft Adaptive Policy Optimization
+
+**Recipe**
+- [NEW] VLA: add experimental support for VLA model
+- [NEW] [rhymerl](https://arxiv.org/abs/2508.18588): History Rhymes: Accelerating LLM Reinforcement Learning with RhymeRL
+- TransferQueue: support multiple data partition and optimize tensor zero-copy serialization
+- One-step-off-policy/Fully async: optimize weight synchronization by checkpoint engine with bucket and pipeline support.
+
+### v0.8
+
+**Model Engine**
+- Deprecate DataProto in favor of TensorDict for zero-padding transmission
+- Switch default to new model engine, mark legacy engine (fsdp_workers.py, megatron_workers.py) as deprecated
+- Feature parity between new and legacy model engine: LoRA/PEFT, etc
+- Polish VeOmni engine to production ready status
+- Support MTP RL training
+- Optimize GPU memory for long context: fine-grained activation recomputation/offload
+- New model support: DeepSeek V3.2, etc
+
+**Rollout Engine**
+- New rollout engine TensorRT-LLM
+- Separate vllm worker from trainer process, update weights by cuda ipc
+
+**TransferQueue**
+- Merge TransferQueue recipe into main
+- Optimize e2e image/video vlm training pipeline by TransferQueue
+- Optimize router replay transmission by TransferQueue
+
+**Checkpoint Engine**
+- Add checkpoint engine abstract interface
+- Add NCCL and NIXL transport backend
+- Add more transport backend
+
+### v0.9
+
+**Trainer**
+- Merge fully async trainer into main: refactor with verl-core components
+
+**Model Engine**
+- Remove legacy model engine (fsdp_workers.py, megatron_workers.py)
+- Support omni-model RL training: Qwen3-Omni, BAGEL, etc
+
+**Rollout Engine**
+- New rollout engine vllm-omni
+
+**More agentic training recipe**
+- SWEAgent
+- GUIAgent
diff --git a/code/RL_model/verl/verl_train/docs/conf.py b/code/RL_model/verl/verl_train/docs/conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbeabbd81b28e97fe0d0e8bcf436ab92f5833743
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/conf.py
@@ -0,0 +1,113 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = "verl"
+copyright = "2024 ByteDance Seed Foundation MLSys Team"
+author = "Guangming Sheng, Chi Zhang, Yanghua Peng, Haibin Lin"
+
+
+# -- General configuration ---------------------------------------------------
+# The master toctree document.
+master_doc = "index"
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ "myst_parser",
+ "sphinx.ext.autodoc",
+ "sphinx.ext.autosummary",
+ "sphinx.ext.autosectionlabel",
+ "sphinx.ext.napoleon",
+ "sphinx.ext.viewcode",
+]
+
+# MyST-Parser settings
+myst_enable_extensions = [
+ "dollarmath", # Enables $...$ and $$...$$ syntax
+ "amsmath", # Enables amsmath environments
+]
+
+# Use Google style docstrings instead of NumPy docstrings.
+napoleon_google_docstring = True
+napoleon_numpy_docstring = False
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+source_suffix = {
+ ".rst": "restructuredtext",
+ ".md": "markdown",
+}
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = "en"
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = "sphinx_rtd_theme"
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
+
+# Add the JavaScript file
+html_js_files = [
+ "js/runllm-widget.js",
+ "js/resizable-sidebar.js",
+]
+
+# Add custom CSS file for full-width layout
+html_css_files = [
+ "custom.css",
+]
+
+exclude_patterns += ["README.md", "README_vllm0.7.md"]
+
+suppress_warnings = ["ref.duplicate", "ref.myst"]
diff --git a/code/RL_model/verl/verl_train/docs/data/transfer_queue.md b/code/RL_model/verl/verl_train/docs/data/transfer_queue.md
new file mode 100644
index 0000000000000000000000000000000000000000..2775034029b8064995421d10b9f6a26c1a0cecf3
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/data/transfer_queue.md
@@ -0,0 +1,290 @@
+# TransferQueue Data System
+
+Last updated: 01/07/2026.
+
+This doc introduces [TransferQueue](https://gitcode.com/Ascend/TransferQueue), an asynchronous streaming data management system for efficient post-training.
+
+🔥 **Now TransferQueue is formally open-sourced at [GitCode](https://gitcode.com/Ascend/TransferQueue). We will soon provide a [Github Mirror Repo](https://github.com/Ascend/TransferQueue) for community contributions. You are welcome to submit contributions or propose new ideas on either platform!**
+
+
+> In the meantime, the early development history remains accessible at: https://github.com/TransferQueue/TransferQueue.
+
+## Overview
+
+TransferQueue is a high-performance data storage and transfer module with panoramic data visibility and streaming scheduling capabilities, optimized for efficient dataflow in post-training workflows.
+
+
+
+
+
+TransferQueue offers **fine-grained, sample-level** data management and **load-balancing** (on the way) capabilities, serving as a data gateway that decouples explicit data dependencies across computational tasks. This enables a divide-and-conquer approach that significantly simplifies the algorithm controller design.
+
+
+
+
+
+## Updates
+
+ - **Dec 30, 2025**: **TransferQueue x verl** integration is tested with the DAPO algorithm at scale **(64 nodes, 1024 cards)**. It significantly optimizes host memory utilization and accelerates data transfers. Stay tuned for more details!
+ - **Dec 20, 2025**: 🔥 The official [tutorial](https://github.com/TransferQueue/TransferQueue/tree/main/tutorial) is released! Feel free to check it out.
+ - **Nov 10, 2025**: We disentangle the data retrieval logic from TransferQueueController [PR#101](https://github.com/TransferQueue/TransferQueue/pull/101). Now you can implement your own `Sampler` to control how to consume the data.
+ - **Nov 5, 2025**: We provide a `KVStorageManager` that simplifies the integration with KV-based storage backends [PR#96](https://github.com/TransferQueue/TransferQueue/pull/96). The first available KV-based backend is [Yuanrong](https://gitee.com/openeuler/yuanrong-datasystem).
+ - **Nov 4, 2025**: Data partition capability is available in [PR#98](https://github.com/TransferQueue/TransferQueue/pull/98). Now you can define logical data partitions to manage your train/val/test datasets.
+ - **Oct 25, 2025**: We make storage backends pluggable in [PR#66](https://github.com/TransferQueue/TransferQueue/pull/66). You can try to integrate your own storage backend with TransferQueue now!
+ - **Oct 21, 2025**: Official integration into verl is ready [verl/pulls/3649](https://github.com/volcengine/verl/pull/3649). Following PRs will optimize the single controller architecture by fully decoupling data & control flows.
+ - **July 22, 2025**: We present a series of Chinese blogs on Zhihu 1, 2.
+ - **July 21, 2025**: We started an RFC on verl community [verl/RFC#2662](https://github.com/volcengine/verl/discussions/2662).
+ - **July 2, 2025**: We publish the paper [AsyncFlow](https://arxiv.org/abs/2507.01663).
+
+## Components
+
+### Control Plane: Panoramic Data Management
+
+In the control plane, `TransferQueueController` tracks the **production status** and **consumption status** of each training sample as metadata. When all the required data fields are ready (i.e., written to the `TransferQueueStorageManager`), we know that this data sample can be consumed by downstream tasks.
+
+For consumption status, we record the consumption records for each computational task (e.g., `generate_sequences`, `compute_log_prob`, etc.). Therefore, even when different computation tasks require the same data field, they can consume the data independently without interfering with each other.
+
+
+
+
+
+To make the data retrieval process more customizable, we provide a `Sampler` class that allows users to define their own data retrieval and consumption logic. Refer to the [Customize](#customize) section for details.
+
+> In the future, we plan to support **load-balancing** and **dynamic batching** capabilities in the control plane. Additionally, we will support data management for disaggregated frameworks where each rank manages the data retrieval by itself, rather than coordinated by a single controller.
+
+### Data Plane: Distributed Data Storage
+
+In the data plane, we provide a pluggable design that enables TransferQueue to integrate with different storage backends according to user requirements.
+
+Specifically, we provide a `TransferQueueStorageManager` abstraction class that defines the core APIs as follows:
+
+- `async def put_data(self, data: TensorDict, metadata: BatchMeta) -> None`
+- `async def get_data(self, metadata: BatchMeta) -> TensorDict`
+- `async def clear_data(self, metadata: BatchMeta) -> None`
+
+This class encapsulates the core interaction logic within the TransferQueue system. You only need to write a simple subclass to integrate your own storage backend. Refer to the [Customize](#customize) section for details.
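+
+For a feel of the adapter's shape, a deliberately tiny in-memory manager might look like this (a sketch only; the `metadata.global_indexes` / `metadata.fields` attributes and per-field indexing are assumptions for illustration):
+
+```python
+import torch
+from tensordict import TensorDict
+
+class InMemoryStorageManager(TransferQueueStorageManager):
+    """Toy adapter: keeps every sample's fields in a local dict."""
+
+    def __init__(self):
+        self._rows: dict[int, dict[str, torch.Tensor]] = {}
+
+    async def put_data(self, data: TensorDict, metadata) -> None:
+        for row, idx in enumerate(metadata.global_indexes):  # assumed attribute
+            self._rows.setdefault(idx, {}).update(
+                {field: data[field][row] for field in data.keys()}
+            )
+
+    async def get_data(self, metadata) -> TensorDict:
+        indexes = metadata.global_indexes  # assumed attribute
+        return TensorDict(
+            {f: torch.stack([self._rows[i][f] for i in indexes])
+             for f in metadata.fields},  # assumed attribute
+            batch_size=[len(indexes)],
+        )
+
+    async def clear_data(self, metadata) -> None:
+        for idx in metadata.global_indexes:
+            self._rows.pop(idx, None)
+```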
+
+Currently, we support the following storage backends:
+
+- SimpleStorageUnit: A basic CPU memory storage with minimal data format constraints and easy usability.
+- [Yuanrong](https://gitcode.com/openeuler/yuanrong-datasystem) (beta, [#PR107](https://github.com/TransferQueue/TransferQueue/pull/107), [#PR96](https://github.com/TransferQueue/TransferQueue/pull/96)): An Ascend native data system that provides hierarchical storage interfaces including HBM/DRAM/SSD.
+- [Mooncake Store](https://github.com/kvcache-ai/Mooncake) (alpha, [#PR162](https://github.com/TransferQueue/TransferQueue/pull/162)): A high-performance, KV-based hierarchical storage that supports RDMA transport between GPU and DRAM.
+- [Ray Direct Transport](https://docs.ray.io/en/master/ray-core/direct-transport.html) (alpha, [#PR167](https://github.com/TransferQueue/TransferQueue/pull/167)): Ray's new feature that allows Ray to store and pass objects directly between Ray actors.
+
+Among them, `SimpleStorageUnit` serves as our default storage backend, coordinated by the `AsyncSimpleStorageManager` class. Each storage unit can be deployed on a separate node, allowing for distributed data management.
+
+`SimpleStorageUnit` employs a 2D data structure as follows:
+
+- Each row corresponds to a training sample, assigned a unique index within the corresponding global batch.
+- Each column represents the input/output data fields for computational tasks.
+
+This data structure design is motivated by the computational characteristics of the post-training process, where each training sample is generated in a relayed manner across task pipelines. It provides an accurate addressing capability, which allows fine-grained, concurrent data read/write operations in a streaming manner.
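+
+Conceptually (values illustrative only), the structure is just a table indexed by (sample row, field column):
+
+```python
+# Row = sample index within the global batch; column = a data field
+# produced or consumed by some computational task.
+global_batch = {
+    0: {"prompt": "...", "response": "...", "old_log_prob": None},  # ready for compute_log_prob
+    1: {"prompt": "...", "response": None,  "old_log_prob": None},  # still generating
+}
+# Sample 0 can be scheduled downstream immediately, without waiting
+# for sample 1 to finish generation.
+```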
+
+
+
+
+
+### User Interface: Asynchronous & Synchronous Client
+
+The interaction workflow of TransferQueue system is as follows:
+
+1. A process sends a read request to the `TransferQueueController`.
+2. `TransferQueueController` scans the production and consumption metadata for each sample (row), and dynamically assembles micro-batch metadata according to the load-balancing policy. This mechanism enables sample-level data scheduling.
+3. The process retrieves the actual data from distributed storage units using the metadata provided by the controller.
+
+To simplify the usage of TransferQueue, we have encapsulated this process into `AsyncTransferQueueClient` and `TransferQueueClient`. These clients provide both asynchronous and synchronous interfaces for data transfer, allowing users to easily integrate TransferQueue into their framework.
+
+> In the future, we will provide a `StreamingDataLoader` interface for disaggregated frameworks as discussed in [issue#85](https://github.com/TransferQueue/TransferQueue/issues/85) and [verl/RFC#2662](https://github.com/volcengine/verl/discussions/2662). Leveraging this abstraction, each rank can automatically get its own data like `DataLoader` in PyTorch. The TransferQueue system will handle the underlying data scheduling and transfer logic caused by different parallelism strategies, significantly simplifying the design of disaggregated frameworks.
+
+## 🔥 Showcases
+
+### General Usage
+
+The primary interaction points are `AsyncTransferQueueClient` and `TransferQueueClient`, serving as the communication interface with the TransferQueue system.
+
+Core interfaces:
+
+- `(async_)get_meta(data_fields: list[str], batch_size:int, partition_id: str, mode: str, task_name:str, sampling_config: Optional[dict[str, Any]]) -> BatchMeta`
+- `(async_)get_data(metadata: BatchMeta) -> TensorDict`
+- `(async_)put(data: TensorDict, metadata: Optional[BatchMeta], partition_id: Optional[str])`
+- `(async_)clear_partition(partition_id: str)` and `(async_)clear_samples(metadata: BatchMeta)`
+
+**Refer to our [tutorial](https://github.com/TransferQueue/TransferQueue/tree/main/tutorial) for detailed examples.**
+
+
+### verl Example
+
+The primary motivation for integrating TransferQueue into verl now is to **alleviate the data transfer bottleneck of the single controller `RayPPOTrainer`**. Currently, all `DataProto` objects must be routed through `RayPPOTrainer`, resulting in a single-point bottleneck for the whole post-training system.
+
+
+
+
+Leveraging TransferQueue, we separate experience data transfer from metadata dispatch by
+
+- Replacing `DataProto` with `BatchMeta` (metadata) and `TensorDict` (actual data) structures
+- Preserving verl's original Dispatch/Collect logic via BatchMeta (maintaining single-controller debuggability)
+- Accelerating data transfer by TransferQueue's distributed storage units
+
+
+
+
+You may refer to the [recipe](https://github.com/TransferQueue/TransferQueue/tree/dev/recipe/simple_use_case), where we mimic the verl usage in both async & sync scenarios. Official integration to verl is also available now at [verl/pulls/3649](https://github.com/volcengine/verl/pull/3649) (with subsequent PRs to further optimize the integration).
+
+
+## Installation
+
+### Use the Python package
+```bash
+pip install TransferQueue
+```
+
+### Build wheel package from source code
+
+Follow these steps to build and install:
+1. Clone the source code from the GitHub repository
+ ```bash
+ git clone https://github.com/TransferQueue/TransferQueue/
+ cd TransferQueue
+ ```
+
+2. Install dependencies
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+3. Build and install
+ ```bash
+ python -m build --wheel
+ pip install dist/*.whl
+ ```
+
+
+
+
+
+
+
+> Note: The above benchmark for TransferQueue is based on our naive `SimpleStorageUnit` backend. By introducing high-performance storage backends and optimizing serialization/deserialization, we expect to achieve even better performance. Warmly welcome contributions from the community!
+
+For detailed performance benchmarks, please refer to [this blog](https://www.yuque.com/haomingzi-lfse7/hlx5g0/tml8ke0zkgn6roey?singleDoc#).
+
+We also provide a [stress test report](https://www.yuque.com/haomingzi-lfse7/hlx5g0/ydbwgo5k2umaag78?singleDoc#) that demonstrates **768 concurrent clients writing 1.4 TB of data** into TransferQueue across 4 nodes. The system remains stable without any crashes or data loss, achieving 80% bandwidth.
+
+## 🛠️ Customize TransferQueue
+
+### Define your own data retrieval logic
+We provide a `BaseSampler` abstraction class, which defines the following interface:
+
+```python3
+@abstractmethod
+def sample(
+ self,
+ ready_indexes: list[int],
+ batch_size: int,
+ *args: Any,
+ **kwargs: Any,
+) -> tuple[list[int], list[int]]:
+ """Sample a batch of indices from the ready indices.
+
+ Args:
+ ready_indexes: List of global indices for which all required fields of the
+ corresponding samples have been produced, and the samples are not labeled as
+ consumed in the corresponding task.
+ batch_size: Number of samples to select
+ *args: Additional positional arguments for specific sampler implementations
+ **kwargs: Additional keyword arguments for specific sampler implementations
+
+ Returns:
+ List of sampled global indices of length batch_size
+ List of global indices of length batch_size that should be labeled as consumed
+ (will never be retrieved in the future)
+
+ Raises:
+ ValueError: If batch_size is invalid or ready_indexes is insufficient
+ """
+ raise NotImplementedError("Subclasses must implement sample")
+```
+
+In this design, we separate data retrieval and data consumption through the two return values, which enables us to easily control sample replacement. We have implemented two reference designs: `SequentialSampler` and `GRPOGroupNSampler`.
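+
+For example, a plain sequential policy might be sketched as follows (illustrative; not necessarily identical to the shipped `SequentialSampler`):
+
+```python3
+class MySequentialSampler(BaseSampler):
+    def sample(self, ready_indexes, batch_size, *args, **kwargs):
+        if len(ready_indexes) < batch_size:
+            raise ValueError("not enough ready samples")
+        picked = ready_indexes[:batch_size]
+        # Retrieve and consume the same indices, so each sample is read exactly once.
+        return picked, picked
+```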
+
+The `Sampler` class or instance should be passed to the `TransferQueueController` during initialization. During each `get_meta` call, you can provide dynamic sampling parameters to the `Sampler`.
+
+```python3
+from transfer_queue import TransferQueueController, TransferQueueClient, GRPOGroupNSampler, process_zmq_server_info
+
+# Option 1: Pass the sampler class to the TransferQueueController
+controller = TransferQueueController.remote(GRPOGroupNSampler)
+
+# Option 2: Pass the sampler instance to the TransferQueueController (if you need custom configuration)
+your_own_sampler = YourOwnSampler(config)
+controller = TransferQueueController.remote(your_own_sampler)
+
+# Use the sampler
+batch_meta = client.get_meta(
+ data_fields=["input_ids", "attention_mask"],
+ batch_size=8,
+ partition_id="train_0",
+ task_name="generate_sequences",
+ sampling_config={"n_samples_per_prompt": 4} # Put the required sampling parameters here
+)
+```
+
+**Refer to [tutorial/04_custom_sampler.py](https://github.com/TransferQueue/TransferQueue/blob/main/tutorial/04_custom_sampler.py) for more details.**
+
+
+### How to integrate a new storage backend
+
+The data plane is organized as follows:
+```text
+ transfer_queue/
+ ├── storage/
+ │   ├── __init__.py
+ │   ├── simple_backend.py              # Default distributed storage backend (SimpleStorageUnit) by TQ
+ │   ├── managers/                      # Managers are upper-level interfaces that encapsulate the interaction logic with the TQ system.
+ │   │   ├── __init__.py
+ │   │   ├── base.py                    # TransferQueueStorageManager, KVStorageManager
+ │   │   ├── simple_backend_manager.py  # AsyncSimpleStorageManager
+ │   │   ├── yuanrong_manager.py        # YuanrongStorageManager
+ │   │   ├── mooncake_manager.py        # MooncakeStorageManager
+ │   │   └── factory.py                 # TransferQueueStorageManagerFactory
+ │   └── clients/                       # Clients are lower-level interfaces that directly manipulate the target storage backend.
+ │       ├── __init__.py
+ │       ├── base.py                    # TransferQueueStorageKVClient
+ │       ├── yuanrong_client.py         # YuanrongStorageClient
+ │       ├── mooncake_client.py         # MooncakeStorageClient
+ │       ├── ray_storage_client.py      # RayStorageClient
+ │       └── factory.py                 # TransferQueueStorageClientFactory
+```
+
+To integrate TransferQueue with a custom storage backend, start by implementing a subclass that inherits from `TransferQueueStorageManager`. This subclass acts as an adapter between the TransferQueue system and the target storage backend. For KV-based storage backends, you can simply inherit from `KVStorageManager`, which can serve as the general manager for all KV-based backends.
+
+Distributed storage backends often come with their own native clients serving as the interface of the storage system. In such cases, a low-level adapter for this client can be written, following the examples provided in the `storage/clients` directory.
+
+Factory classes are provided for both `StorageManager` and `StorageClient` to facilitate easy integration. Adding necessary descriptions of required parameters in the factory class helps enhance the overall user experience.
+
+## ✏️ Contribution Guide
+
+**Contributions are warmly welcome!**
+
+New ideas, feature suggestions, and user experience feedback are all encouraged—feel free to submit issues or PRs. We will respond as soon as possible.
+
+We recommend using pre-commit to keep code formatting consistent.
+
+```bash
+# install pre-commit
+pip install pre-commit
+
+# run the following command in your repo folder, then fix the check before committing your code
+pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always
+```
+
+
+## Citation
+Please kindly cite our paper if you find this repo is useful:
+
+```bibtex
+@article{han2025asyncflow,
+ title={AsyncFlow: An Asynchronous Streaming RL Framework for Efficient LLM Post-Training},
+ author={Han, Zhenyu and You, Ansheng and Wang, Haibo and Luo, Kui and Yang, Guang and Shi, Wenqi and Chen, Menglong and Zhang, Sicheng and Lan, Zeshun and Deng, Chunshi and others},
+ journal={arXiv preprint arXiv:2507.01663},
+ year={2025}
+}
+```
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/docs/examples/config.rst b/code/RL_model/verl/verl_train/docs/examples/config.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9909dd67581c3aa2d2ecb8b889e5955081cb24fc
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/examples/config.rst
@@ -0,0 +1,735 @@
+.. _config-explain-page:
+
+Config Explanation
+===================
+
+Last updated: 06/18/2025.
+
+ppo_trainer.yaml for RL FSDP Backend
+-------------------------------------
+
+Data
+~~~~
+
+.. code:: yaml
+
+ data:
+ tokenizer: null
+ train_files: ~/data/rlhf/gsm8k/train.parquet
+ val_files: ~/data/rlhf/gsm8k/test.parquet
+ train_max_samples: -1 # set to -1 to use full dataset
+ val_max_samples: -1 # set to -1 to use full dataset
+ prompt_key: prompt
+ max_prompt_length: 512
+ max_response_length: 512
+ train_batch_size: 1024
+ return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
+ return_raw_chat: False
+ return_full_prompt: False
+ shuffle: True
+ seed: 42
+ filter_overlong_prompts: False
+ filter_overlong_prompts_workers: 1
+ truncation: error
+ image_key: images
+ trust_remote_code: True
+ custom_cls:
+ path: null
+ name: null
+
+- ``data.train_files``: Training set parquet. Can be a list or a single
+ file. The program will read all files into memory, so it can't be too
+ large (< 100GB). The path can be either local path or HDFS path. For
+ HDFS path, we provide utils to download it to DRAM and convert the
+ HDFS path to local path.
+- ``data.val_files``: Validation parquet. Can be a list or a single
+ file.
+- ``data.train_max_samples``: Maximum number of samples to use from the
+ training dataset. Set to -1 to use the full dataset.
+- ``data.val_max_samples``: Maximum number of samples to use from the
+ validation dataset. Set to -1 to use the full dataset.
+- ``data.prompt_key``: The field in the dataset where the prompt is
+ located. Default is 'prompt'.
+- ``data.max_prompt_length``: Maximum prompt length. All prompts will be
+ left-padded to this length. An error will be reported if the length is
+ too long
+- ``data.max_response_length``: Maximum response length. Rollout in RL
+ algorithms (e.g. PPO) generates up to this length
+- ``data.train_batch_size``: Batch size sampled for one training
+ iteration of different RL algorithms.
+- ``data.return_raw_input_ids``: Whether to return the original
+ input_ids without adding chat template. This is mainly used to
+ accommodate situations where the reward model's chat template differs
+ from the policy. It needs to be decoded first, then apply the RM's
+ chat template. If using a model-based RM, and the policy and RM
+ chat_templates are different, this flag needs to be set
+- ``data.return_raw_chat``: Whether to return the original chat (prompt)
+ without applying chat template.
+- ``data.return_full_prompt``: Whether to return the full prompt with chat template
+- ``data.shuffle``: Whether to shuffle the data in the dataloader.
+- ``data.seed``: An integer seed to use when shuffling the data. If not set or set to
+ `null`, the data shuffling will not be seeded, resulting in a different data order on each run.
+- ``data.filter_overlong_prompts``: Whether to filter out overlong prompts. Disabled by default.
+- ``data.filter_overlong_prompts_workers``: For large-scale datasets, filtering
+ overlong prompts can be time-consuming. You can set ``filter_overlong_prompts_workers``
+ to use multiprocessing for a speedup. Defaults to 1.
+- ``data.truncation``: Truncate the input_ids or prompt length if they
+ exceed max_prompt_length. Default is 'error', which does not allow exceeding
+ max_prompt_length; users should increase max_prompt_length if that error is
+ thrown. You can also set ``left``, ``right`` and ``middle``.
+ When ``middle`` is selected, the logic splits the allowed max length roughly in half
+ and keeps the head and tail of the sequence, effectively discarding the middle section (see the sketch after this list).
+- ``data.image_key``: The field in the multi-modal dataset where the image is
+ located. Default is 'images'.
+- ``data.trust_remote_code``: If the remote tokenizer has python file, we can use this field to allow
+ using remote tokenizer. For example: moonshotai/Moonlight-16B-A3B-Instruct
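+
+A sketch of the ``middle`` truncation behavior described above (illustrative, not
+verl's exact implementation):
+
+.. code:: python
+
+    def truncate_middle(input_ids: list[int], max_len: int) -> list[int]:
+        """Keep the head and tail of the sequence, discard the middle."""
+        if len(input_ids) <= max_len:
+            return input_ids
+        head = max_len // 2
+        tail = max_len - head
+        return input_ids[:head] + input_ids[-tail:]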
+
+Customized Dataset
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Customized dataset extension is implemented for the SFT trainer and can be extended to other trainers with similar changes.
+
+.. code:: yaml
+
+ custom_cls:
+ path: null
+ name: null
+
+- ``data.custom_cls.path``: The path to the file containing your customized dataset class. If not specified, pre-implemented dataset will be used.
+- ``data.custom_cls.name``: The name of the dataset class within the specified file.
+
+Actor/Rollout/Reference Policy
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: yaml
+
+ actor_rollout_ref:
+ hybrid_engine: True
+ model:
+ path: ~/models/deepseek-llm-7b-chat
+ external_lib: null
+ override_config:
+ attn_implementation: flash_attention_2 # or eager, sdpa - attention implementation override
+ model_config: {}
+ moe_config: # Megatron only, can adjust moe configuration
+ freeze_moe_router: False # Megatron only, can freeze moe router (no grad)
+ enable_gradient_checkpointing: False
+ enable_activation_offload: False
+ trust_remote_code: False
+ use_remove_padding: False
+ actor:
+ strategy: fsdp # This is for backward-compatibility
+ ppo_mini_batch_size: 256
+ ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+ ppo_micro_batch_size_per_gpu: 8
+ use_dynamic_bsz: False
+ ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+ grad_clip: 1.0
+ clip_ratio: 0.2
+ entropy_coeff: 0.0
+ use_kl_loss: False # True for GRPO
+ # Rollout Correction (corrects distribution mismatch between rollout and training)
+ rollout_correction:
+ rollout_is: token # IS weights: token/sequence/null
+ rollout_is_threshold: 2.0 # Upper threshold for IS weights
+ rollout_rs: null # Rejection sampling: token/sequence/geometric/null
+ rollout_rs_threshold: null # RS upper threshold
+ rollout_rs_threshold_lower: null # RS lower threshold
+ rollout_token_veto_threshold: null # Per-token veto (null to disable)
+ use_torch_compile: True # False to disable torch compile
+ kl_loss_coef: 0.001 # for grpo
+ kl_loss_type: low_var_kl # for grpo
+ ppo_epochs: 1
+ data_loader_seed: null
+ shuffle: False
+ ulysses_sequence_parallel_size: 1 # sp size
+ optim:
+ lr: 1e-6
+ lr_warmup_steps: -1 # Prioritized. Negative values mean delegating to lr_warmup_steps_ratio.
+ lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
+ min_lr_ratio: 0.0 # only used with cosine lr scheduler, default to 0.0
+ num_cycles: 0.5 # only used with cosine lr scheduler, default to 0.5
+ lr_scheduler_type: constant # select from constant/cosine
+ total_training_steps: -1 # must be override by program
+ fsdp_config:
+ wrap_policy:
+ # transformer_layer_cls_to_wrap: None
+ min_num_params: 0
+ param_offload: False
+ optimizer_offload: False
+ fsdp_size: -1
+ checkpoint:
+ # What to include in saved checkpoints
+ # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
+ save_contents: ['model', 'optimizer', 'extra']
+ # For more flexibility, you can specify the contents to load from the checkpoint.
+ load_contents: ${actor_rollout_ref.actor.checkpoint.save_contents}
+ ref:
+ fsdp_config:
+ param_offload: False
+ wrap_policy:
+ # transformer_layer_cls_to_wrap: None
+ min_num_params: 0
+ log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+ log_prob_micro_batch_size_per_gpu: 16
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
+ rollout:
+ name: vllm
+ temperature: 1.0
+ top_k: -1 # 0 for hf rollout, -1 for vllm rollout
+ top_p: 1
+ prompt_length: ${data.max_prompt_length} # not use for opensource
+ response_length: ${data.max_response_length}
+ # for vllm rollout
+ dtype: bfloat16 # should align with FSDP
+ gpu_memory_utilization: 0.5
+ ignore_eos: False
+ enforce_eager: True
+ free_cache_engine: True
+ load_format: dummy_dtensor
+ tensor_model_parallel_size: 2
+ max_num_batched_tokens: 8192
+ max_num_seqs: 1024
+ log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+ log_prob_micro_batch_size_per_gpu: 16
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+ # for hf rollout
+ do_sample: True
+ engine_kwargs: # inference engine parameters, please refer vllm/sglang official doc for detail
+ vllm: {}
+ sglang: {}
+
+ n: 1 # for each prompt, sample n responses (i.e. num sample times). set it to values > 1 for grpo, rloo
+ calculate_log_probs: False # set to True for computing log probs via rollouts
+ val_kwargs:
+ # sampling parameters for validation
+ top_k: -1 # 0 for hf rollout, -1 for vllm rollout
+ top_p: 1.0
+ temperature: 0
+ n: 1
+ do_sample: False # default eager for validation
+
+ agent:
+ custom_async_server: # Use custom async server implementation for rollout
+ path: null
+ name: null
+
+**Common config for actor, rollout and reference model**
+
+- ``actor_rollout_ref.hybrid_engine``: Whether to use the hybrid engine;
+ currently only the hybrid engine is supported
+- ``actor_rollout_ref.model.path``: Huggingface model path. This can be
+ either local path or HDFS path. For HDFS path, we provide utils to
+ download it to DRAM and convert the HDFS path to local path.
+- ``actor_rollout_ref.model.external_lib``: Additional Python packages
+ that need to be imported. Used to register models or tokenizers into
+ the Huggingface system.
+- ``actor_rollout_ref.model.override_config``: Used to override some of
+ the model's original configurations. Common overrides include:
+
+ - ``attn_implementation``: Override the attention implementation. Default is ``flash_attention_2``.
+ Supported values: ``flash_attention_2``, ``eager``, ``sdpa``. Use ``eager`` for debugging or
+ compatibility issues. See :ref:`attention-implementation-override` for detailed usage.
+
+- ``actor_rollout_ref.model.enable_gradient_checkpointing``: FSDP only; decides
+ whether to enable gradient checkpointing for the actor.
+ Megatron uses the recompute options in ``override_transformer_config`` instead.
+- ``actor_rollout_ref.model.enable_activation_offload``: Whether to enable
+ activation offloading for the actor
+- ``actor_rollout_ref.model.trust_remote_code``: Whether to enable loading
+ a remote code model
+- ``actor_rollout_ref.model.use_fused_kernels``: Whether to use fused
+ kernels in the model. If set to True, the following parameters will be
+ used.
+
+ - ``actor_rollout_ref.model.fused_kernel_options.impl_backend``: The
+ implementation backend for fused kernels. Options: "triton" or
+ "torch". Default is "torch".
+ While in megatron, we only support "triton" as the
+ implementation backend, so there is no need for this option.
+
+- ``actor_rollout_ref.model.use_remove_padding``: Whether to use remove
+ padding in the model. If set to True, the model will remove padding
+ tokens in the input_ids and response_ids. This helps a lot in improving model running efficiency.
+
+- ``actor_rollout_ref.model.tiled_mlp``: TiledMLP configuration for memory-efficient
+ MLP computation. Reduces peak memory by processing MLP forward/backward in tiles.
+ Only compatible with FSDP2 (requires ``actor_rollout_ref.actor.strategy=fsdp2``).
+
+ - ``actor_rollout_ref.model.tiled_mlp.enabled``: Whether to enable TiledMLP.
+ Default is False.
+ - ``actor_rollout_ref.model.tiled_mlp.num_shards``: Number of shards to split
+ the input. Higher values reduce peak memory but may slightly impact performance.
+ Default is 4.
+
+**Actor model**
+
+- ``actor_rollout_ref.actor.strategy``: fsdp or megatron. In this
+ example, we use fsdp backend.
+
+- ``actor_rollout_ref.actor.ppo_mini_batch_size``: One sample is split
+ into multiple sub-batches with batch_size=ppo_mini_batch_size for PPO
+ updates. The ppo_mini_batch_size is a global num across all workers/gpus
+
+- ``actor_rollout_ref.actor.ppo_micro_batch_size``: [Will be deprecated, use ppo_micro_batch_size_per_gpu]
+ Similar to gradient accumulation, the micro_batch_size_per_gpu for one forward pass,
+ trading speed for GPU memory. The value represents the global view.
+
+- ``actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu``: Similar to gradient
+ accumulation, the micro_batch_size_per_gpu for one forward pass, trading speed
+ for GPU memory. The value represents the local number per GPU.
+
+- ``actor_rollout_ref.actor.grad_clip``: Gradient clipping for actor
+ updates
+- ``actor_rollout_ref.actor.use_kl_loss``: to use kl loss in actor. When used, we are not applying KL in the reward function.
+
+- ``actor_rollout_ref.actor.clip_ratio``: PPO clip ratio
+
+- ``actor_rollout_ref.actor.use_torch_compile``: Whether to use torch compile in actor
+
+- ``actor_rollout_ref.actor.entropy_coeff``: The weight of entropy when
+ calculating PPO loss. The default value is changed to 0.0 since v0.3.x
+
+- ``actor_rollout_ref.actor.ppo_epochs``: Number of epochs for PPO
+ updates on one set of sampled data
+
+- ``actor_rollout_ref.actor.data_loader_seed``: Since torch 2.6.0, the Megatron backend can receive inconsistent PyTorch-generated seeds
+ across cp ranks, causing data misalignment between those ranks, so we manually set the seed to avoid hanging
+ issues. If ``actor_rollout_ref.actor.shuffle`` is not null, this must be set.
+
+- ``actor_rollout_ref.actor.shuffle``: Whether to shuffle data when
+ there are multiple epochs
+
+- ``actor_rollout_ref.actor.optim``: Actor's optimizer parameters
+
+- ``actor_rollout_ref.actor.fsdp_config``: FSDP config for actor
+ training
+
+ - ``wrap_policy``: FSDP wrap policy. By default, it uses Huggingface's
+ wrap policy, i.e., wrapping by DecoderLayer
+
+ - No need to set transformer_layer_cls_to_wrap, so we comment it.
+
+ - ``*_offload``: Whether to enable parameter, gradient and optimizer
+ offload
+
+ - Trading speed for GPU memory.
+
+- ``actor_rollout_ref.actor.use_kl_loss``: Whether to enable kl loss. Default is False.
+
+- ``actor_rollout_ref.actor.kl_loss_coef``: The coefficient of kl loss. Default is 0.001.
+
+- ``actor_rollout_ref.actor.kl_loss_type``: Support ``kl`` (``k1``), ``abs``, ``mse`` (``k2``), ``low_var_kl`` (``k3``) and ``full``. Appending ``+`` at the end (e.g., ``k1+`` and ``k3+``) uses straight-through to employ ``k2`` for unbiased gradient estimation, regardless of the KL value estimation (see https://github.com/volcengine/verl/pull/2953#issuecomment-3162113848 for more details). This controls how the KL divergence between the actor and reference policy is calculated. For specific options, refer to ``kl_penalty()`` in ``core_algos.py``. See this blog post for a detailed analysis: http://joschu.net/blog/kl-approx.html
+
+- ``actor_rollout_ref.actor.checkpoint``: The configurations of checkpoint function in actor
+
+  - ``save_contents``: The contents to save in the checkpoint. By default, we save the model, optimizer and extra information in the checkpoint.
+    The extra information currently includes RNG states; the lr_scheduler is supported under FSDP, and support for Megatron's opt_param_scheduler is coming soon.
+    We do not store the hf_model in the checkpoint by default, but we provide a tool in ``scripts/model_merge.py`` to convert checkpoints to the HF format.
+
+  - ``load_contents``: The contents to load from the checkpoint; you can specify different loading contents. By default, it is the same as ``save_contents``.
+
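+As referenced in the ``kl_loss_type`` entry above, the basic per-token KL estimators follow John Schulman's blog post. The sketch below is illustrative only; the exact definitions (including ``full`` and the ``+`` variants) live in ``kl_penalty()`` in ``core_algos.py``.
+
+.. code:: python
+
+   import torch
+
+   def kl_estimate(logprob, ref_logprob, kind="kl"):
+       """Per-token KL estimators (k1/k2/k3), following
+       http://joschu.net/blog/kl-approx.html. Sketch only."""
+       if kind == "kl":          # k1: plain log-ratio
+           return logprob - ref_logprob
+       if kind == "abs":
+           return (logprob - ref_logprob).abs()
+       if kind == "mse":         # k2: half squared log-ratio
+           return 0.5 * (logprob - ref_logprob) ** 2
+       if kind == "low_var_kl":  # k3: lower-variance, non-negative estimator
+           log_ratio = ref_logprob - logprob
+           return torch.clamp(log_ratio.exp() - log_ratio - 1, min=-10, max=10)
+       raise NotImplementedError(kind)
+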
+**Reference Model**
+
+The reference model is enabled when ``actor.use_kl_loss`` and/or ``algorithm.use_kl_in_reward`` is True.
+
+- ``actor_rollout_ref.ref``: FSDP config, same as the actor's. **For models
+  larger than 7B, it's recommended to turn on offload for the ref by
+  default.**
+
+- ``actor_rollout_ref.ref.log_prob_micro_batch_size``: [Will be deprecated; use log_prob_micro_batch_size_per_gpu]
+  The batch size for one forward pass in the computation of ``ref_log_prob``. The value represents the global size.
+
+- ``actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu``: The batch size
+  for one forward pass in the computation of ``ref_log_prob``. The value represents the local size per GPU.
+
+**Rollout Model**
+
+- ``actor_rollout_ref.rollout.name``: hf/vllm/sglang.
+
+- Rollout (Auto-regressive) parameters. The key should be equal to the
+ property name in vLLM's ``SamplingParams``.
+
+ - ``temperature``, ``top_k``, ``top_p`` and others: Sampling
+ parameters in ``SamplingParams``.
+
+- ``actor_rollout_ref.rollout.dtype``: Rollout model parameter type. This should be aligned with
+  the actor model parameter type in the FSDP/Megatron backend.
+
+- ``actor_rollout_ref.rollout.gpu_memory_utilization``:
+
+ - For vLLM v0.7.0 and later: The fraction of **total** GPU memory to be used for the vLLM instance.
+ - For SGLang: Corresponding to ``mem_fraction_static``, the fraction of the free GPU memory used for **static** memory like model weights and KV cache.
+
+- ``actor_rollout_ref.rollout.tensor_model_parallel_size``: TP size for rollout. Only effective
+ for vllm.
+
+- ``actor_rollout_ref.rollout.log_prob_micro_batch_size``: [Will be deprecated; use log_prob_micro_batch_size_per_gpu]
+  The batch size for one forward pass in the computation of ``log_prob``. The value represents the global size.
+
+- ``actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu``: Micro batch size per GPU (the batch size for
+  one forward pass) for recalculating ``log_prob``. The value represents the local size per GPU.
+
+- ``actor_rollout_ref.rollout.do_sample``: Whether to sample during training rollout. If set to False, the rollout model
+ will perform greedy sampling.
+
+- ``actor_rollout_ref.rollout.val_kwargs``: Sampling parameters used specifically during validation.
+
+ - ``top_k``: Top-k sampling parameter. Default to -1 for vLLM rollout or 0 for HF rollout.
+ - ``top_p``: Top-p sampling parameter. Default is 1.0 (disabled).
+ - ``temperature``: Sampling temperature. Default is 0 (deterministic greedy).
+ - ``n``: Number of responses to generate during validation. Default is 1.
+ - ``do_sample``: Whether to use sampling during validation. Default is False for
+ deterministic outputs. When set to True, the rollout will use the ``actor_rollout_ref.rollout.val_kwargs`` parameters
+ (top_k, top_p, temperature) to control the sampling behavior.
+
+- ``actor_rollout_ref.rollout.engine_kwargs.vllm``: extra vLLM engine arguments; please refer to the official vLLM docs for details
+
+- ``actor_rollout_ref.rollout.engine_kwargs.sglang``: extra SGLang engine arguments; please refer to the official SGLang docs for details
+
+- ``actor_rollout_ref.rollout.ignore_eos``: Whether to ignore the EOS
+ token and continue generating tokens after the EOS token is generated.
+
+- ``actor_rollout_ref.rollout.free_cache_engine``: Offload the KVCache
+  after the rollout generation stage. Default is True. When set to True
+  with vLLM v0.5.4 and v0.6.3, the usage of CUDAGraph must be disabled
+  (set ``enforce_eager`` to True).
+
+- ``actor_rollout_ref.rollout.enforce_eager``: Whether to enforce eager
+  execution (disabling CUDAGraph) in vLLM generation. Default is True,
+  which disables CUDAGraph.
+
+- ``actor_rollout_ref.rollout.load_format``: Which weight loader to use
+ to load the actor model weights to the rollout model.
+
+ - ``auto``: Use Megatron weight loader.
+ - ``megatron``: Use Megatron weight loader. Deployed with Megatron
+ backend. The input model ``state_dict()`` is already partitioned
+ along TP dimension and already gathered along PP dimension. This
+ weight loader requires that the Rollout model and Actor model's
+ parameters shape and name should be identical.
+ - ``dtensor``: Default solution when using Huggingface weight loader.
+ Deployed with FSDP backend and the state_dict_type is
+ ``StateDictType.SHARDED_STATE_DICT``. Recommend to use this weight
+ loader
+ - ``hf``: Use Huggingface weight loader. Deployed with FSDP backend
+ and the state_dict_type is ``StateDictType.FULL_STATE_DICT``. This
+ solution doesn't need to rewrite the weight loader for each model
+ implemented in vLLM but it results in larger peak memory usage.
+ - ``dummy_hf``, ``dummy_megatron``, ``dummy_dtensor``: Random
+ initialization.
+
+.. note:: In this config field, users only need to select from ``dummy_megatron``, ``dummy_dtensor``, ``dummy_hf`` for rollout initialization; our hybrid engine will select the corresponding weight loader (i.e., ``megatron``, ``dtensor``, ``hf``) during actor/rollout weight synchronization.
+
+
+Megatron Optimizer and Optimizer Parameter Scheduler
+____________________________________________________
+
+.. code:: yaml
+
+ optim:
+ optimizer: adam
+ lr: 1e-6
+ clip_grad: 1.0
+ total_training_steps: -1 # must be override by program
+ lr_warmup_init: 0.0 # initial learning rate for warmup, default to 0.0
+ lr_warmup_steps: -1 # Prioritized. Negative values mean delegating to lr_warmup_steps_ratio.
+ lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
+ lr_decay_steps: null
+ lr_decay_style: constant # select from constant/linear/cosine/inverse_square_root
+ min_lr: 0.0 # minimum learning rate, default to 0.0
+ weight_decay: 0.01
+ weight_decay_incr_style: constant # select from constant/linear/cosine
+ lr_wsd_decay_style: exponential # select from constant/exponential/cosine
+ lr_wsd_decay_steps: null
+ use_checkpoint_opt_param_scheduler: False # use checkpoint optimizer parameter scheduler
+
+
+Notice that there are some differences in the APIs between the Megatron optimizer and the FSDP optimizer.
+
+- The Megatron optimizer scheduler names the period after lr_warmup ``lr_decay_steps``, so the ``lr_decay_style`` actually means the style of lr decay after warmup.
+- The Megatron optimizer also supports a weight-decay scheduling mechanism.
+- ``use_checkpoint_opt_param_scheduler`` determines whether to use the checkpoint optimizer parameter scheduler. If set to True, the optimizer parameter scheduler will be saved in the checkpoint and loaded from the checkpoint when resuming training.
+
+For learning rate decay, Megatron's original pretraining default for ``lr_decay_style`` is ``linear``,
+meaning that the learning rate is linearly decayed from the initial learning rate to ``min_lr`` within
+``lr_decay_steps``. However, in verl, to align with FSDP's default behavior, we set the default
+``lr_decay_style`` to ``constant``, meaning that the learning rate is kept constant after the warmup stage.
+
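+A rough sketch of the resulting schedule (linear warmup followed by the chosen decay style); see Megatron's optimizer parameter scheduler for the real logic:
+
+.. code:: python
+
+   def lr_at_step(step, lr=1e-6, min_lr=0.0, lr_warmup_init=0.0,
+                  warmup_steps=100, decay_steps=1000, decay_style="constant"):
+       # Sketch only: linear warmup, then constant or linear decay.
+       if step < warmup_steps:
+           return lr_warmup_init + (lr - lr_warmup_init) * step / warmup_steps
+       if decay_style == "constant":
+           return lr
+       frac = min(1.0, (step - warmup_steps) / max(1, decay_steps - warmup_steps))
+       if decay_style == "linear":
+           return lr - (lr - min_lr) * frac
+       raise NotImplementedError(decay_style)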
+
+Critic Model
+~~~~~~~~~~~~
+
+Most parameters for the Critic are similar to those of the Actor Model.
+
+Reward Model
+~~~~~~~~~~~~
+
+.. code:: yaml
+
+ reward_model:
+ enable: False
+ model:
+ input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
+ path: ~/models/Anomy-RM-v0.1
+ external_lib: ${actor_rollout_ref.model.external_lib}
+ trust_remote_code: False
+ fsdp_config:
+ min_num_params: 0
+ param_offload: False
+ micro_batch_size_per_gpu: 16
+ max_length: null
+ reward_manager: naive
+
+- ``reward_model.enable``: Whether to enable reward model. If False, we
+ compute the reward only with the user-defined reward functions. In
+ GSM8K and Math examples, we disable reward model. For RLHF alignment
+ example using full_hh_rlhf, we utilize reward model to assess the
+ responses. If False, the following parameters are not effective.
+- ``reward_model.model``
+
+  - ``input_tokenizer``: Input tokenizer. If the reward model's chat
+    template is inconsistent with the policy's, we need to first decode the
+    response to plaintext, then apply the RM's chat_template, and then score
+    with the RM (see the sketch after this list). If the chat templates are
+    consistent, it can be set to null.
+ - ``path``: RM's HDFS path or local path. Note that RM only supports
+ AutoModelForSequenceClassification. Other model types need to define
+ their own RewardModelWorker and pass it from the code.
+ - ``trust_remote_code``: Whether to enable loading a remote code model,
+ default to False.
+- ``reward_model.reward_manager``: Reward Manager. This defines the mechanism
+ of computing rule-based reward and handling different reward sources. Default
+ is ``naive``. If all verification functions are multiprocessing-safe, the reward
+ manager can be set to ``prime`` for parallel verification.
+
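+To illustrate the ``input_tokenizer`` mechanism referenced above, here is a rough sketch of the decode-and-re-encode step (the paths are placeholders, not real models):
+
+.. code:: python
+
+   from transformers import AutoTokenizer
+
+   policy_tok = AutoTokenizer.from_pretrained("path/to/policy-model")  # placeholder
+   rm_tok = AutoTokenizer.from_pretrained("path/to/reward-model")      # placeholder
+
+   # Sketch: decode the generated ids back to plain text with the policy
+   # tokenizer, then re-apply the RM's own chat template before scoring.
+   response_ids = policy_tok("An example response.")["input_ids"]
+   response_text = policy_tok.decode(response_ids, skip_special_tokens=True)
+   chat = [{"role": "assistant", "content": response_text}]
+   rm_input_ids = rm_tok.apply_chat_template(chat, tokenize=True, return_tensors="pt")
+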
+Customized Reward Function
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: yaml
+
+ custom_reward_function:
+ path: null
+ name: compute_score
+
+- ``custom_reward_function.path``: The path to the file containing your customized reward function. If not specified, pre-implemented reward functions will be used.
+- ``custom_reward_function.name`` (Optional): The name of the reward function within the specified file. Default is 'compute_score'.
+
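+A minimal sketch of such a file (the signature below follows the common pattern of verl's built-in reward functions; check your verl version for the exact expected arguments):
+
+.. code:: python
+
+   # my_reward.py -- hypothetical file passed via custom_reward_function.path
+   def compute_score(data_source, solution_str, ground_truth, extra_info=None):
+       """Return a scalar reward for one sample (sketch: exact match)."""
+       answer = solution_str.strip().split()[-1] if solution_str.strip() else ""
+       return 1.0 if answer == ground_truth else 0.0
+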
+Algorithm
+~~~~~~~~~
+
+.. code:: yaml
+
+ algorithm:
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: gae
+ use_kl_in_reward: False
+ kl_penalty: kl # how to estimate kl divergence
+ kl_ctrl:
+ type: fixed
+ kl_coef: 0.005
+ horizon: 10000
+ target_kl: 0.1
+ # Rollout Correction
+ rollout_correction:
+ rollout_is: null # IS weights: token/sequence/null
+ rollout_is_threshold: 2.0 # Upper threshold for IS weights
+ rollout_rs: null # Rejection sampling: token/sequence/geometric/null
+ rollout_rs_threshold: null # RS upper threshold
+ rollout_rs_threshold_lower: null # RS lower threshold
+ rollout_token_veto_threshold: null # Per-token veto (null to disable)
+
+- ``gamma``: Discount factor.
+- ``lam``: Trade-off between bias and variance in the GAE estimator (see the sketch after this list).
+- ``adv_estimator``: Supports ``gae``, ``grpo``, ``reinforce_plus_plus``, ``reinforce_plus_plus_baseline``, ``rloo``, ``rloo_vectorized``, ``grpo_vectorized``.
+- ``use_kl_in_reward``: Whether to enable in-reward kl penalty. Default is False.
+- ``kl_penalty``: Supports ``kl``, ``abs``, ``mse``, ``low_var_kl`` and ``full``. This controls how
+  the KL divergence between the actor and the reference policy is calculated. For
+  the specific options, refer to ``kl_penalty()`` in ``core_algos.py``.
+- ``kl_ctrl``: Config for in-reward kl_penalty controller
+
+ - ``kl_coef``: The (initial) coefficient of in-reward kl_penalty. Default is 0.001.
+ - ``type``: 'fixed' for FixedKLController and 'adaptive' for AdaptiveKLController.
+ - ``horizon`` and ``target_kl``: See source code of AdaptiveKLController for details.
+
+- ``rollout_correction``: Rollout Correction configuration (nested dict). Set to ``null`` to disable.
+ When enabled, contains:
+
+ - ``rollout_is``: IS weights aggregation level: ``token``, ``sequence``, or ``null`` to disable IS weights.
+ - ``rollout_is_threshold``: Upper threshold for IS weights (e.g., 2.0).
+ - ``rollout_rs``: Rejection sampling mode: ``token``, ``sequence``, ``geometric``, or ``null`` to disable RS.
+ - ``rollout_rs_threshold``: RS upper threshold.
+ - ``rollout_rs_threshold_lower``: RS lower threshold (null = auto-reciprocal).
+ - ``rollout_token_veto_threshold``: Per-token veto threshold for catastrophic outliers (null = disabled).
+
+ Note: Rollout Correction requires setting ``actor_rollout_ref.rollout.calculate_log_probs=True``.
+
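+As referenced in the ``lam`` entry above, below is a simplified, single-trajectory sketch of how ``gamma`` and ``lam`` enter the GAE recursion; verl's batched, masked implementation lives in ``core_algos.py``.
+
+.. code:: python
+
+   import torch
+
+   def compute_gae(rewards, values, gamma=1.0, lam=1.0):
+       # rewards[t], values[t] for t = 0..T-1; assumes V(s_T) = 0.
+       T = rewards.shape[0]
+       advantages = torch.zeros(T)
+       last_gae = 0.0
+       for t in reversed(range(T)):
+           next_value = values[t + 1] if t + 1 < T else 0.0
+           delta = rewards[t] + gamma * next_value - values[t]  # TD residual
+           last_gae = delta + gamma * lam * last_gae
+           advantages[t] = last_gae
+       returns = advantages + values
+       return advantages, returns
+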
+Trainer
+~~~~~~~
+
+.. code:: yaml
+
+ trainer:
+ total_epochs: 30
+ project_name: verl_examples
+ experiment_name: gsm8k
+ logger: ['console', 'wandb']
+ log_val_generations: 0
+ nnodes: 1
+ n_gpus_per_node: 8
+ save_freq: -1
+ val_before_train: True
+ test_freq: 2
+ critic_warmup: 0
+ default_hdfs_dir: null # hdfs checkpoint path
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} # local checkpoint path
+ resume_mode: auto # or disable or resume_path if resume_from_path is set
+ resume_from_path: null
+ remove_previous_ckpt_in_save: False
+ del_local_ckpt_after_load: False
+ ray_wait_register_center_timeout: 300
+
+- ``trainer.total_epochs``: Number of epochs in training.
+- ``trainer.project_name``: The project name, for wandb, swanlab, mlflow.
+- ``trainer.experiment_name``: The experiment name, for wandb, swanlab, mlflow.
+- ``trainer.logger``: Supports console, wandb, swanlab, mlflow, tensorboard and trackio.
+- ``trainer.log_val_generations``: The number of generations logged during validation (default ``0``).
+- ``trainer.nnodes``: Number of nodes used in the training.
+- ``trainer.n_gpus_per_node``: Number of GPUs per node.
+- ``trainer.save_freq``: The frequency (by iteration) to save checkpoint
+ of the actor and critic model.
+- ``trainer.val_before_train``: Whether to run validation before training.
+- ``trainer.test_freq``: The validation frequency (by iteration).
+- ``trainer.critic_warmup``: The number of iterations to train the critic
+  model before actual policy learning.
+- ``trainer.resume_mode``: The mode of resuming training. Support
+ ``disable``, ``auto`` and ``resume_path``. If set to ``auto`` as default, the
+ program will automatically resume from the latest checkpoint in the
+ ``default_local_dir``. If set to ``resume_path``, the program will resume
+ from the path specified in ``resume_from_path``.
+- ``trainer.resume_from_path``: The path to resume training from. Only
+ effective when ``resume_mode`` is set to ``resume_path``.
+- ``trainer.remove_previous_ckpt_in_save``: Whether to remove previous
+ checkpoints in the save directory. Default is False.
+- ``trainer.del_local_ckpt_after_load``: Whether to delete local
+ checkpoints after loading them. Default is False.
+- ``trainer.ray_wait_register_center_timeout``: The timeout for waiting
+ for the ray register center to be ready. Default is 300 seconds.
+
+
+This figure illustrates how the configurations affect the training.
+
+https://excalidraw.com/#json=pfhkRmiLm1jnnRli9VFhb,Ut4E8peALlgAUpr7E5pPCA
+
+.. image:: https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d
+
+
+evaluation.yaml
+---------------
+
+Data
+~~~~
+
+.. code:: yaml
+
+ data:
+ path: /tmp/math_Qwen2-7B-Instruct.parquet
+ prompt_key: prompt
+ response_key: responses
+ data_source_key: data_source
+ reward_model_key: reward_model
+
+- ``data.path``: Path to the dataset file (Parquet format).
+- ``data.prompt_key``: The field in the dataset where the prompt is located. Default is 'prompt'.
+- ``data.response_key``: The key that holds the generated responses. This should be a list of strings representing the responses. Default is 'responses'.
+- ``data.data_source_key``: This is used to separate metric calculations for different data sources, ensuring that metrics are calculated independently for each source.
+- ``data.reward_model_key``: The key that holds the reference answers. These reference answers typically serve as the ground truth or test cases for the task.
+
+Customized Reward Function
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: yaml
+
+ custom_reward_function:
+ path: null
+ name: compute_score
+
+- ``custom_reward_function.path``: The path to the file containing your customized reward function. If not specified, pre-implemented reward functions will be used.
+- ``custom_reward_function.name`` (Optional): The name of the reward function within the specified file. Default is 'compute_score'.
+
+sft_trainer.yaml for SFT FSDP Backend
+--------------------------------------
+
+
+Optim
+~~~~~~~
+
+.. code:: yaml
+
+ optim:
+ optimizer: AdamW
+ optimizer_impl: torch.optim
+ lr: 1e-5
+ weight_decay: 0.01
+ lr_warmup_steps_ratio: 0.1
+ clip_grad: 1.0
+ lr_scheduler: cosine
+ override_optimizer_config: null
+
+- ``optimizer``: Optimizer class name (e.g., ``"AdamW"``, ``"AdamW8bit"``, ``"_AdamW"``). The class name as it appears in the module.
+- ``optimizer_impl``: Module path to import optimizer from (e.g., ``"torch.optim"``, ``"torchao.optim"``, ``"bitsandbytes.optim"``).
+- ``optim.lr``: Learning rate for the optimizer.
+- ``optim.weight_decay``: Weight decay for the optimizer.
+- ``optim.lr_warmup_steps_ratio``: Ratio of warmup steps to total training steps.
+- ``optim.clip_grad``: Gradient clipping value.
+- ``optim.lr_scheduler``: Learning rate scheduler type. Options:
+
+ - ``cosine``: Cosine learning rate scheduler with warmup (default).
+ - ``wsd``: Warmup-Stable-Decay scheduler that provides a stable learning rate phase between warmup and decay phases.
+
+- ``override_optimizer_config``: Dictionary of additional optimizer-specific keyword arguments. For example, to use ``torchao.optim``'s ``_AdamW`` with BF16 stochastic rounding: ``{"bf16_stochastic_round": true}``
+
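+The ``optimizer``/``optimizer_impl`` pair is resolved roughly as in the sketch below (illustrative only; the actual construction happens inside the SFT trainer):
+
+.. code:: python
+
+   import importlib
+
+   def build_optimizer(params, optim_cfg):
+       # Import the class named by `optimizer` from the module named by
+       # `optimizer_impl`, then pass through any extra kwargs.
+       module = importlib.import_module(optim_cfg["optimizer_impl"])  # e.g. "torch.optim"
+       optim_cls = getattr(module, optim_cfg["optimizer"])            # e.g. "AdamW"
+       extra = optim_cfg.get("override_optimizer_config") or {}
+       return optim_cls(params, lr=optim_cfg["lr"],
+                        weight_decay=optim_cfg["weight_decay"], **extra)
+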
+Model
+~~~~~~~~~~~~
+
+Most parameters for the Model are similar to those of the Reward Model.
+
+.. code:: yaml
+
+ model:
+ partial_pretrain: ~/models/gemma-1.1-7b-it
+ fsdp_config:
+ model_dtype: fp32
+ wrap_policy:
+ min_num_params: 0
+ cpu_offload: False
+ offload_params: False
+ external_lib: null
+ enable_gradient_checkpointing: False
+ trust_remote_code: False
+ lora_rank: 0
+ lora_alpha: 16
+ target_modules: all-linear
+ use_liger: False
+
+- ``partial_pretrain``: HDFS path or local path for the pretrained model.
+- ``fsdp_config``
+
+ - ``model_dtype``: Model parameters type, default to ``fp32``.
+ Support: ``bf16``, ``fp16``, ``fp32``.
+ - ``cpu_offload``: Whether to enable CPU offloading for FSDP. If True,
+ the offload_params will be used as argument.
+ - ``offload_params``: Whether to offload parameters to CPU
+ when not involved in computation. If True, then this offloads gradients
+ to CPU as well, meaning that the optimizer step runs on CPU.
+
+- ``lora_rank``: The rank of the LoRA model, default to 0. If ``lora_rank`` > 0,
+  we will train LoRA modules instead of tuning the full model.
+- ``lora_alpha``: The alpha parameter for LoRA scaling, default to 16.
+- ``target_modules``: The names of the modules to apply the adapter to,
+  default to ``all-linear``. See the peft docs for details.
+
+- ``use_liger``: Whether to enable Liger kernel, default to False. If True,
+ we apply Liger kernel to the model (depends on `liger-kernel`).
diff --git a/code/RL_model/verl/verl_train/docs/examples/gsm8k_example.rst b/code/RL_model/verl/verl_train/docs/examples/gsm8k_example.rst
new file mode 100644
index 0000000000000000000000000000000000000000..bc56497be64e578c6623fc917e34d376457b3676
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/examples/gsm8k_example.rst
@@ -0,0 +1,190 @@
+GSM8K Example
+=============
+
+Last updated: 03/25/2025.
+
+Introduction
+------------
+
+In this example, we train an LLM to tackle the GSM8k task.
+
+Paper: https://arxiv.org/pdf/2110.14168
+
+Dataset: https://huggingface.co/datasets/openai/gsm8k
+
+Note that the original paper mainly focuses on training a verifier (a
+reward model) to solve math problems via Best-of-N sampling. In this
+example, we train an RLHF agent using a rule-based reward model.
+
+Dataset Introduction
+--------------------
+
+GSM8k is a math problem dataset. Each prompt is an elementary school
+math problem, and the LLM is required to answer it.
+
+The training set contains 7473 samples and the test set contains 1319
+samples.
+
+**An example**
+
+Prompt
+
+ Katy makes coffee using teaspoons of sugar and cups of water in the
+ ratio of 7:13. If she used a total of 120 teaspoons of sugar and cups
+ of water, calculate the number of teaspoonfuls of sugar she used.
+
+Solution
+
+   The total ratio representing the ingredients she used to make the
+   coffee is 7+13 = <<7+13=20>>20. Since the fraction representing the
+   number of teaspoons she used is 7/20, she used 7/20*120 =
+   <<7/20*120=42>>42 #### 42
+
+Step 1: Prepare dataset
+-----------------------
+
+.. code:: bash
+
+ cd examples/data_preprocess
+ python3 gsm8k.py --local_save_dir ~/data/gsm8k
+
+Step 2: Download Model
+----------------------
+
+There are three ways to prepare the model checkpoints for post-training:
+
+- Download the required models from huggingface or modelscope
+
+.. code:: bash
+
+ hf download deepseek-ai/deepseek-math-7b-instruct --local-dir ~/models/deepseek-math-7b-instruct --local-dir-use-symlinks False
+ # or
+ modelscope download --model deepseek-ai/deepseek-math-7b-instruct --local_dir ~/models/deepseek-math-7b-instruct
+
+- Use a model already stored in a local directory or an HDFS path.
+- Alternatively, you can directly use the huggingface model name (e.g.,
+  deepseek-ai/deepseek-math-7b-instruct) in the
+  ``actor_rollout_ref.model.path`` and ``critic.model.path`` fields in
+  the run script. You can also download models from modelscope by setting the environment variable ``VERL_USE_MODELSCOPE=True``.
+  See examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh for an example.
+
+Note that users should prepare checkpoints for the actor, critic and reward
+model.
+
+[Optional] Step 3: SFT your Model
+---------------------------------
+
+We provide an SFT Trainer using PyTorch FSDP in
+``fsdp_sft_trainer.py``.
+Users can customize their own SFT
+script using our FSDP SFT Trainer.
+
+We also provide various training scripts for SFT on the GSM8K dataset in the gsm8k SFT directory of the examples.
+
+.. code:: shell
+
+ set -x
+
+ torchrun -m verl.trainer.fsdp_sft_trainer \
+ data.train_files=$HOME/data/gsm8k/train.parquet \
+ data.val_files=$HOME/data/gsm8k/test.parquet \
+ data.prompt_key=question \
+ data.response_key=answer \
+ data.micro_batch_size_per_gpu=8 \
+ model.partial_pretrain=deepseek-ai/deepseek-coder-6.7b-instruct \
+ trainer.project_name=gsm8k-sft \
+ trainer.experiment_name=gsm8k-sft-deepseek-coder-6.7b-instruct \
+ trainer.total_epochs=4 \
+ trainer.logger='["console","wandb"]'
+
+
+If you use AMD GPUs (ROCm kernel), you need to add the following environment variables into the run script:
+
+ .. code-block:: bash
+
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
+ export CUDA_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
+
+
+Step 4: Perform PPO training with your model on GSM8K Dataset
+-------------------------------------------------------------
+
+- Prepare your own run.sh script. Here's an example for GSM8k dataset
+ and deepseek-llm-7b-chat model.
+- Users could replace the ``data.train_files``, ``data.val_files``,
+  ``actor_rollout_ref.model.path`` and ``critic.model.path`` based on
+  their environment.
+- See :doc:`config` for detailed explanation of each config field.
+
+**Reward Model/Function**
+
+We use a rule-based reward model. We force the model to produce a final
+answer following four "#" characters ("####"), as shown in the solution. We
+extract the final answer from both the solution and the model's output using
+regular expression matching. We compare them and assign a reward of 1 to a
+correct answer, 0.1 to an incorrect answer and 0 to no answer.
+
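+A sketch of this rule-based scoring (the real implementation lives in verl's GSM8K reward function; the regex below is illustrative):
+
+.. code:: python
+
+   import re
+
+   def gsm8k_score(solution_str, ground_truth):
+       # Extract the final answer after "####", as in the GSM8K solution format.
+       match = re.search(r"####\s*(-?[0-9][0-9,\.]*)", solution_str)
+       if match is None:
+           return 0.0                      # no parsable answer
+       answer = match.group(1).replace(",", "")
+       return 1.0 if answer == ground_truth else 0.1
+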
+**Training Script**
+
+The example training scripts for the FSDP and Megatron-LM backends are stored in the examples/ppo_trainer directory.
+
+.. code:: bash
+
+ cd ../ppo_trainer
+ bash run_deepseek7b_llm.sh
+
+The content of run_deepseek7b_llm.sh:
+
+.. code:: bash
+
+ set -x
+
+ python3 -m verl.trainer.main_ppo \
+ data.train_files=$HOME/data/gsm8k/train.parquet \
+ data.val_files=$HOME/data/gsm8k/test.parquet \
+ data.train_batch_size=1024 \
+ data.max_prompt_length=512 \
+ data.max_response_length=512 \
+ actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ critic.optim.lr=1e-5 \
+ critic.model.use_remove_padding=True \
+ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
+ critic.model.enable_gradient_checkpointing=True \
+ critic.ppo_micro_batch_size_per_gpu=32 \
+ critic.model.fsdp_config.param_offload=False \
+ critic.model.fsdp_config.optimizer_offload=False \
+ algorithm.kl_ctrl.kl_coef=0.001 \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='verl_example_gsm8k' \
+ trainer.experiment_name='deepseek_llm_7b_function_rm' \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=-1 \
+ trainer.test_freq=1 \
+ trainer.total_epochs=15 $@
+
+
+If you use AMD GPUs (ROCm kernel), you need to add the following environment variables into the run script:
+
+ .. code-block:: bash
+
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
+ export CUDA_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
+
+If you encounter any issues using AMD GPUs to run verl, feel free to contact Yusheng Su.
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/docs/examples/multi_modal_example.rst b/code/RL_model/verl/verl_train/docs/examples/multi_modal_example.rst
new file mode 100644
index 0000000000000000000000000000000000000000..844005b66eac5a8b0543d3e67a722c0c11293c95
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/examples/multi_modal_example.rst
@@ -0,0 +1,45 @@
+Multi-Modal Example Architecture
+=================================
+
+Last updated: 04/28/2025.
+
+Introduction
+------------
+
+verl now supports multi-modal training. You can use FSDP and
+vLLM/SGLang to start a multi-modal RL task. Megatron support is also
+on the way.
+
+Follow the steps below to quickly start a multi-modal RL task.
+
+Step 1: Prepare dataset
+-----------------------
+
+.. code:: bash
+
+ # it will be saved in the $HOME/data/geo3k folder
+ python examples/data_preprocess/geo3k.py
+
+Step 2: Download Model
+----------------------
+
+.. code:: bash
+
+ # download the model from huggingface
+ python3 -c "import transformers; transformers.pipeline(model='Qwen/Qwen2.5-VL-7B-Instruct')"
+
+Step 3: Perform GRPO training with multi-modal model on Geo3K Dataset
+---------------------------------------------------------------------
+
+.. code:: bash
+
+ # run the task
+ bash examples/grpo_trainer/run_qwen2_5_vl-7b.sh
+
+
+
+
+
+
+
+
diff --git a/code/RL_model/verl/verl_train/docs/examples/ppo_code_architecture.rst b/code/RL_model/verl/verl_train/docs/examples/ppo_code_architecture.rst
new file mode 100644
index 0000000000000000000000000000000000000000..94d62413a2a684385eae801281995d6a02f05b3a
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/examples/ppo_code_architecture.rst
@@ -0,0 +1,209 @@
+PPO Example Architecture
+========================
+
+Last updated: 02/17/2025.
+
+Let's start with the Proximal Policy Optimization algorithm, which is
+the most widely used algorithm in LLM post-training.
+
+The main entry point of the PPO algorithm example is ``main_ppo.py``.
+In this tutorial, we will go through the code architecture in ``main_ppo.py``.
+
+Define the data
+---------------
+
+Users need to preprocess and store the dataset in parquet files,
+and we implement ``RLHFDataset`` to load and tokenize them.
+
+For ``RLHFDataset`` (the default), at least one field is required:
+
+- ``prompt``: Contains the string prompt
+
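+For instance, a minimal compatible parquet file can be produced as below (a sketch; the chat-format ``prompt`` column mirrors what the bundled preprocessing scripts emit):
+
+.. code:: python
+
+   import pandas as pd
+
+   # Sketch of a minimal parquet file for RLHFDataset.
+   df = pd.DataFrame({
+       "prompt": [
+           [{"role": "user", "content": "What is 7 + 13?"}],
+       ],
+   })
+   df.to_parquet("/tmp/example_train.parquet")
+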
+We already provide some examples of processing datasets to parquet
+files in the data_preprocess directory. Currently, we support
+preprocessing of the GSM8k, MATH, HellaSwag and Full_hh_rlhf datasets. See :doc:`../preparation/prepare_data` for
+more information.
+
+Define the reward functions for different datasets
+--------------------------------------------------
+
+In this main entry point, the users only need to define their own reward
+function based on the datasets (or applications) utilized in PPO
+training.
+
+For example, we already provide reward functions for the GSM8k and MATH
+datasets in ``_select_rm_score_fn``. In the ``RewardManager``, we
+compute the reward score based on the data_source to select the
+corresponding reward function. For some RLHF datasets (e.g.,
+corresponding reward functions. For some RLHF datasets (e.g.,
+full_hh_rlhf), the reward model is utilized to assess the responses
+without any reward functions. In this case, the ``RewardManager`` will
+return the ``rm_score`` computed by the reward model directly.
+
+See the reward functions in the codebase for the detailed implementation.
+
+Define worker classes
+---------------------
+
+.. code:: python
+
+ if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}: # for FSDP backend
+ assert config.critic.strategy in {"fsdp", "fsdp2"}
+ from verl.workers.fsdp_workers import ActorRolloutRefWorker, CriticWorker
+ from verl.single_controller.ray import RayWorkerGroup
+ ray_worker_group_cls = RayWorkerGroup
+
+ elif config.actor_rollout_ref.actor.strategy == 'megatron': # for Megatron backend
+ assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
+ from verl.workers.megatron_workers import ActorRolloutRefWorker, CriticWorker
+ from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
+ ray_worker_group_cls = NVMegatronRayWorkerGroup # Ray worker class for Megatron-LM
+
+ else:
+ raise NotImplementedError
+
+ from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
+
+ role_worker_mapping = {
+ Role.ActorRollout: ActorRolloutRefWorker,
+ Role.Critic: CriticWorker,
+ Role.RefPolicy: ActorRolloutRefWorker
+ }
+
+ global_pool_id = 'global_pool'
+ resource_pool_spec = {
+ global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
+ }
+ mapping = {
+ Role.ActorRollout: global_pool_id,
+ Role.Critic: global_pool_id,
+ Role.RefPolicy: global_pool_id,
+ }
+
+Step 1: Construct the mapping between roles and workers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A role represents a group of workers in the same process. We have
+pre-defined several roles in ``ray_trainer.py``.
+
+.. code:: python
+
+ class Role(Enum):
+ """
+ To create more roles dynamically, you can subclass Role and add new members
+ """
+ Actor = 0 # This worker only has Actor
+ Rollout = 1 # This worker only has Rollout
+ ActorRollout = 2 # This worker has both actor and rollout, it's a HybridEngine
+ Critic = 3 # This worker only has critic
+ RefPolicy = 4 # This worker only has reference policy
+ RewardModel = 5 # This worker only has reward model
+ ActorRolloutRef = 6 # This worker contains actor, rollout and reference policy simultaneously
+
+Step 2: Define the worker class corresponding to this role
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- We have pre-implemented the ``ActorRolloutRefWorker``. Through
+ different configs, it can be a standalone actor, a standalone rollout,
+ an ActorRollout HybridEngine, or an ActorRolloutRef HybridEngine
+- We also pre-implemented workers for ``Actor``, ``Rollout``,
+  ``Critic``, ``Reward Model`` and ``Reference model`` on two different
+  backends: PyTorch FSDP and Megatron-LM.
+  See the FSDP workers and Megatron-LM workers
+  for more information.
+
+Step 3: Define resource pool id and resource pool spec
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- A resource pool is a division of the global GPU resources;
+  ``resource_pool_spec`` is a dict mapping from pool id to the number of GPUs.
+
+  - In the above example, we defined a global resource pool,
+    global_pool_id, and then put all roles on this one resource pool
+    with all the GPUs in this post-training task. This refers to
+    *co-located* placement, where all the models share the same set of
+    GPUs.
+
+- See resource pool and placement for advanced usage.
+
+Defining reward model/function
+------------------------------
+
+.. code:: python
+
+ # we should adopt a multi-source reward function here
+ # - for rule-based rm, we directly call a reward score
+ # - for model-based rm, we call a model
+ # - for code related prompt, we send to a sandbox if there are test cases
+ # - finally, we combine all the rewards together
+ # - The reward type depends on the tag of the data
+ if config.reward_model.enable:
+ from verl.workers.fsdp_workers import RewardModelWorker
+ role_worker_mapping[Role.RewardModel] = RewardModelWorker
+ mapping[Role.RewardModel] = global_pool_id
+
+ reward_fn = RewardManager(tokenizer=tokenizer, num_examine=0)
+
+ # Note that we always use function-based RM for validation
+ val_reward_fn = RewardManager(tokenizer=tokenizer, num_examine=1)
+
+ resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
+
+Since not all tasks use model-based RM, users need to define here
+whether it's a model-based RM or a function-based RM.
+
+- If it's a model-based RM, directly add the ``RewardModel`` role in the
+ resource mapping and add it to the resource pool mapping.
+
+  - Note that the pre-defined ``RewardModelWorker`` only supports models
+    with the huggingface ``AutoModelForSequenceClassification`` structure.
+    If your model differs, you need to define your own RewardModelWorker
+    in the FSDP workers and Megatron-LM workers.
+
+- If it's a function-based RM, users are required to specify the
+  reward function for each dataset.
+
+.. code:: python
+
+ def _select_rm_score_fn(data_source):
+ if data_source == 'openai/gsm8k':
+ return gsm8k.compute_score
+ elif data_source == 'lighteval/MATH':
+ return math.compute_score
+ else:
+ raise NotImplementedError
+
+See the reward functions implemented in the codebase
+for more information.
+
+Define, init and run the PPO Trainer
+------------------------------------
+
+.. code:: python
+
+ trainer = RayPPOTrainer(config=config,
+ tokenizer=tokenizer,
+ role_worker_mapping=role_worker_mapping,
+ resource_pool_manager=resource_pool_manager,
+ ray_worker_group_cls=ray_worker_group_cls,
+ reward_fn=reward_fn,
+ val_reward_fn=val_reward_fn)
+ trainer.init_workers()
+ trainer.fit()
+
+- We first initialize the ``RayPPOTrainer`` with the user config, tokenizer
+  and all the above worker mappings, resource pool, worker group and
+  reward functions
+- We then call ``trainer.init_workers()`` to initialize the models
+  on the allocated GPUs (in the resource pool)
+- The actual PPO training is executed in ``trainer.fit()``
+
+verl can be easily extended to other RL algorithms by reusing the Ray
+model workers, resource pool and reward functions. See :doc:`extension<../advance/dpo_extension>` for
+more information.
+
+Details of the ``RayPPOTrainer`` is discussed in :doc:`Ray Trainer<../workers/ray_trainer>`.
diff --git a/code/RL_model/verl/verl_train/docs/examples/sandbox_fusion_example.rst b/code/RL_model/verl/verl_train/docs/examples/sandbox_fusion_example.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f3359efda2e14fa6d869b9af21060d6053ac112e
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/examples/sandbox_fusion_example.rst
@@ -0,0 +1,54 @@
+Sandbox Fusion Example
+============================
+
+Last updated: 06/27/2025.
+
+Introduction
+------------
+
+Sandbox Fusion is a remote code sandbox service that provides a secure environment for running and evaluating code generated by Large Language Models (LLMs). This example demonstrates how to train an LLM and use Sandbox Fusion to verify generated code, enhancing both security and performance.
+
+By leveraging a remote code sandbox service with greater CPU resources for concurrent code verification, you can reduce the reward stage time by 10-30%, depending on the quality of the generated code.
+
+Step 1: Prepare the Dataset
+---------------------------
+
+We use the Eurus-2-RL-Data dataset for training. This dataset combines math and code questions, making it suitable for LLM training tasks. You can download the Eurus-2-RL-Data dataset from HuggingFace.
+
+Step 2: Set Up the Sandbox Fusion Service
+-----------------------------------------
+
+Sandbox Fusion is a remote code sandbox service designed to securely run and evaluate LLM-generated code. To use it:
+
+1. **Access Full Documentation**: For detailed setup instructions, refer to the Sandbox Fusion documentation.
+2. **Deploy the Service**: Choose one of the following deployment methods:
+
+   - **Local Deployment**: Follow the local deployment guide in the Sandbox Fusion documentation.
+   - **FaaS Instance (Volcengine)**: Create an instance following the Volcengine documentation.
+
+After deployment, you will receive an API endpoint in the format: ``https:///run_code``.
+
+Step 3: Configure the Training Script
+-------------------------------------
+
+To integrate Sandbox Fusion into your training script, configure the following parameters:
+
+**Key Settings for Sandbox Fusion**
+
+- ``reward_model.sandbox_fusion.url=''``: Enable Sandbox Fusion by specifying the API endpoint (must end with ``/run_code``).
+- ``reward_model.sandbox_fusion.max_concurrent=256``: Set the maximum number of concurrent API requests to the Sandbox Fusion service.
+- ``reward_model.sandbox_fusion.memory_limit_mb=1024``: Set the memory limit (in MB) for each sandbox instance. Defaults to 1024MB if not specified.
+
+**Additional Optimization**
+
+To further reduce code verification time, enable parallel processing with:
+
+- ``reward_model.reward_manager=prime``: The Prime reward manager verifies code across multiple subprocesses concurrently.
+
+**Example Script**
+
+For a practical implementation, refer to the example script:
+
+``examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh``
+
+Once you’ve set your API endpoint in the script, you can start the training job.
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/docs/examples/skypilot_examples.rst b/code/RL_model/verl/verl_train/docs/examples/skypilot_examples.rst
new file mode 100644
index 0000000000000000000000000000000000000000..de91781be63290be6da5bf4b62624addb6446a2d
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/examples/skypilot_examples.rst
@@ -0,0 +1,146 @@
+SkyPilot Examples
+=================
+
+Last updated: 09/04/2025.
+
+This guide provides examples of running verl reinforcement learning training on Kubernetes clusters or cloud platforms with GPU nodes using SkyPilot.
+
+Installation and Configuration
+-------------------------------
+
+Step 1: Install SkyPilot
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Choose the installation based on your target platform:
+
+.. code-block:: bash
+
+ # For Kubernetes only
+ pip install "skypilot[kubernetes]"
+
+ # For AWS
+ pip install "skypilot[aws]"
+
+ # For Google Cloud Platform
+ pip install "skypilot[gcp]"
+
+ # For Azure
+ pip install "skypilot[azure]"
+
+ # For multiple platforms
+ pip install "skypilot[kubernetes,aws,gcp,azure]"
+
+Step 2: Configure Your Platform
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+See https://docs.skypilot.co/en/latest/getting-started/installation.html
+
+Step 3: Set Up Environment Variables
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Export necessary API keys for experiment tracking:
+
+.. code-block:: bash
+
+ # For Weights & Biases tracking
+ export WANDB_API_KEY="your-wandb-api-key"
+
+ # For HuggingFace gated models (if needed)
+ export HF_TOKEN="your-huggingface-token"
+
+Examples
+--------
+
+All example configurations are available in the ``examples/skypilot/`` directory on GitHub. See the README in that directory for additional details.
+
+PPO Training
+~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ sky launch -c verl-ppo verl-ppo.yaml --secret WANDB_API_KEY -y
+
+Runs PPO training on GSM8K dataset using Qwen2.5-0.5B-Instruct model across 2 nodes with H100 GPUs. Based on examples in ``examples/ppo_trainer/``.
+
+The ``verl-ppo.yaml`` file is available in the ``examples/skypilot/`` directory on GitHub.
+
+GRPO Training
+~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ sky launch -c verl-grpo verl-grpo.yaml --secret WANDB_API_KEY -y
+
+Runs GRPO (Group Relative Policy Optimization) training on MATH dataset using Qwen2.5-7B-Instruct model. Memory-optimized configuration for 2 nodes. Based on examples in ``examples/grpo_trainer/``.
+
+The ``verl-grpo.yaml`` file is available in the ``examples/skypilot/`` directory on GitHub.
+
+Multi-turn Tool Usage Training
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ sky launch -c verl-multiturn verl-multiturn-tools.yaml \
+ --secret WANDB_API_KEY --secret HF_TOKEN -y
+
+Single-node training with 8xH100 GPUs for multi-turn tool usage with Qwen2.5-3B-Instruct. Includes tool and interaction configurations for GSM8K. Based on examples in ``examples/sglang_multiturn/`` but uses vLLM instead of sglang.
+
+The ``verl-multiturn-tools.yaml`` file is available in the ``examples/skypilot/`` directory on GitHub.
+
+Configuration
+-------------
+
+The example YAML files are pre-configured with:
+
+- **Infrastructure**: Kubernetes clusters (``infra: k8s``) - can be changed to ``infra: aws`` or ``infra: gcp``, etc.
+- **Docker Image**: VERL's official Docker image with CUDA 12.6 support
+- **Setup**: Automatically clones and installs VERL from source
+- **Datasets**: Downloads required datasets during setup phase
+- **Ray Cluster**: Configures distributed training across nodes
+- **Logging**: Supports Weights & Biases via ``--secret WANDB_API_KEY``
+- **Models**: Supports gated HuggingFace models via ``--secret HF_TOKEN``
+
+Launch Command Options
+----------------------
+
+- ``-c ``: Cluster name for managing the job
+- ``--secret KEY``: Pass secrets for API keys (can be used multiple times)
+- ``-y``: Skip confirmation prompt
+
+Monitoring Your Jobs
+--------------------
+
+Check Cluster Status
+~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ sky status
+
+View Logs
+~~~~~~~~~
+
+.. code-block:: bash
+
+ sky logs verl-ppo # View logs for the PPO job
+
+SSH into Head Node
+~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ ssh verl-ppo
+
+Access Ray Dashboard
+~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ sky status --endpoint 8265 verl-ppo # Get dashboard URL
+
+Stop a Cluster
+~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ sky down verl-ppo
diff --git a/code/RL_model/verl/verl_train/docs/faq/faq.rst b/code/RL_model/verl/verl_train/docs/faq/faq.rst
new file mode 100644
index 0000000000000000000000000000000000000000..aa150d65b1da895da0ae4b6780513be501cc0b52
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/faq/faq.rst
@@ -0,0 +1,209 @@
+Frequently Asked Questions
+====================================
+
+Last updated: 09/24/2025.
+
+Ray related
+------------
+
+How to add breakpoint for debugging with distributed Ray?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Please checkout the official debugging guide from Ray: https://docs.ray.io/en/latest/ray-observability/ray-distributed-debugger.html
+
+
+"Unable to register worker with raylet"
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This issue is usually caused by system settings, e.g., SLURM adding constraints on how the CPUs are shared on a node.
+While ``ray.init()`` tries to launch as many worker processes as there are CPU cores on the machine,
+SLURM constraints can prevent the core workers from seeing the ``raylet`` process, leading to the problem.
+
+To fix this issue, you can set the config term ``ray_init.num_cpus`` to a number allowed by your system.
+
+Distributed training
+------------------------
+
+How to run multi-node post-training with Ray?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can start a ray cluster and submit a ray job, following the official guide from Ray: https://docs.ray.io/en/latest/ray-core/starting-ray.html
+
+Then in the configuration, set the ``trainer.nnodes`` config to the number of machines for your job.
+
+How to use verl on a Slurm-managed cluster?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Ray provides an official tutorial to start a Ray cluster on top of Slurm.
+We have verified the :doc:`GSM8K example<../examples/gsm8k_example>`
+on a Slurm cluster under a multi-node setting with the following steps.
+
+1. [Optional] If your cluster supports Apptainer or Singularity and you wish
+to use it, convert verl's Docker image to an Apptainer image. Alternatively, set up the environment with the package
+manager available on your cluster or use other container runtimes (e.g. through Slurm's OCI support) available to you.
+
+.. code:: bash
+
+ apptainer pull /your/dest/dir/vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3.sif docker://verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
+
+2. Follow :doc:`GSM8K example<../examples/gsm8k_example>` to prepare the dataset and model checkpoints.
+
+3. Modify ``examples/slurm/ray_on_slurm.slurm`` with your cluster's own information.
+
+4. Submit the job script to the Slurm cluster with `sbatch`.
+
+Please note that Slurm cluster setups may vary. If you encounter any issues, please refer to Ray's
+Slurm user guide for common caveats.
+
+If you changed Slurm resource specifications, please make sure to update the environment variables in the job script if necessary.
+
+
+Install related
+------------------------
+
+NotImplementedError: TensorDict does not support membership checks with the `in` keyword.
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Detail error information:
+
+.. code:: bash
+
+ NotImplementedError: TensorDict does not support membership checks with the `in` keyword. If you want to check if a particular key is in your TensorDict, please use `key in tensordict.keys()` instead.
+
+Cause of the problem: there is no suitable version of the tensordict package for the linux-arm64 platform. You can confirm this as follows:
+
+.. code:: bash
+
+ pip install tensordict==0.6.2
+
+Output example:
+
+.. code:: bash
+
+ ERROR: Could not find a version that satisfies the requirement tensordict==0.6.2 (from versions: 0.0.1a0, 0.0.1b0, 0.0.1rc0, 0.0.2a0, 0.0.2b0, 0.0.3, 0.1.0, 0.1.1, 0.1.2, 0.8.0, 0.8.1, 0.8.2, 0.8.3)
+ ERROR: No matching distribution found for tensordict==0.6.2
+
+Solution 1:
+    Install tensordict from source:
+
+.. code:: bash
+
+ pip uninstall tensordict
+ git clone https://github.com/pytorch/tensordict.git
+ cd tensordict/
+ git checkout v0.6.2
+ python setup.py develop
+ pip install -v -e .
+
+Solution 2:
+    Temporarily modify the code where the error occurs: ``tensordict_var`` -> ``tensordict_var.keys()``
+
+
+Illegal memory access
+---------------------------------
+
+If you encounter the error message like ``CUDA error: an illegal memory access was encountered`` during rollout, please check the vLLM documentation for troubleshooting steps specific to your vLLM version.
+
+Checkpoints
+------------------------
+
+If you want to convert the model checkpoint into huggingface safetensor format, please refer to ``verl/model_merger``.
+
+
+Triton ``compile_module_from_src`` error
+------------------------------------------------
+
+If you encounter a triton compilation error similar to the stacktrace below, please set the ``use_torch_compile`` flag as described in
+https://verl.readthedocs.io/en/latest/examples/config.html to disable just-in-time compilation for fused kernels.
+
+.. code:: bash
+
+ File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/jit.py", line 345, in
+ return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
+ File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 338, in run
+ return self.fn.run(*args, **kwargs)
+ File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/jit.py", line 607, in run
+ device = driver.active.get_current_device()
+ File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/driver.py", line 23, in __getattr__
+ self._initialize_obj()
+ File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/driver.py", line 20, in _initialize_obj
+ self._obj = self._init_fn()
+ File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/driver.py", line 9, in _create_driver
+ return actives[0]()
+ File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 371, in __init__
+ self.utils = CudaUtils() # TODO: make static
+ File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 80, in __init__
+ mod = compile_module_from_src(Path(os.path.join(dirname, "driver.c")).read_text(), "cuda_utils")
+ File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 57, in compile_module_from_src
+ so = _build(name, src_path, tmpdir, library_dirs(), include_dir, libraries)
+ File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/build.py", line 48, in _build
+ ret = subprocess.check_call(cc_cmd)
+ File "/data/lbh/conda_envs/verl/lib/python3.10/subprocess.py", line 369, in check_call
+ raise CalledProcessError(retcode, cmd)
+
+What is the meaning of train batch size, mini batch size, and micro batch size?
+------------------------------------------------------------------------------------------
+
+This figure illustrates the relationship between different batch size configurations.
+
+https://excalidraw.com/#json=pfhkRmiLm1jnnRli9VFhb,Ut4E8peALlgAUpr7E5pPCA
+
+.. image:: https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d
+
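+As a rough sketch of the arithmetic (assuming the FSDP backend and the global interpretation of the batch-size fields described in the config docs):
+
+.. code:: python
+
+   # Illustrative numbers only; names mirror the config fields.
+   train_batch_size = 1024            # prompts sampled per RL step (global)
+   ppo_mini_batch_size = 256          # global batch per optimizer update
+   ppo_micro_batch_size_per_gpu = 16  # per-GPU forward/backward chunk
+   n_gpus = 8
+
+   # Optimizer updates per RL step, and gradient-accumulation steps per update.
+   num_updates = train_batch_size // ppo_mini_batch_size                       # 4
+   grad_accum = ppo_mini_batch_size // n_gpus // ppo_micro_batch_size_per_gpu  # 2
+   print(num_updates, grad_accum)
+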
+How to generate ray timeline to analyse performance of a training job?
+------------------------------------------------------------------------------------------
+
+To generate the ray timeline file, you can set the config term ``ray_init.timeline_json_file`` to a json file path.
+For example:
+
+.. code:: bash
+
+ ray_init.timeline_json_file=/tmp/ray_timeline.json
+
+The file will be generated in the specified path at the end of a training job.
+You can view the ray timeline file with tools like chrome://tracing or the Perfetto UI.
+
+This figure shows the ray timeline file generated from a training job on 1 node with 4 GPUs.
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray_timeline.png?raw=true
+
+How to set proxy only for wandb?
+------------------------------------------------------------------------------------------
+
+If you need a proxy to access wandb, you can add the config below to your training job script.
+Compared to using a global https_proxy environment variable, this approach won't affect other HTTP requests, such as those from the ChatCompletionScheduler.
+
+.. code:: bash
+
+ +trainer.wandb_proxy=http://
+
+Mismatch between inference and training sequences (high actor/grad_norm)
+------------------------------------------------------------------------------------------
+
+If you encounter the actor/grad_norm metric continuously increasing during training, it might be caused by a significant precision mismatch between the inference engine and training. You can use the following parameter to confirm this:
+
+.. code:: bash
+
+ actor_rollout_ref.rollout.calculate_log_probs=True
+
+This parameter will add metrics like training/rollout_probs_diff_mean, which can be used to verify whether there is a precision difference between inference and training.
+
+Under normal circumstances, the value of training/rollout_probs_diff_mean should be below 0.005. If you observe a value higher than 0.01, it indicates a precision issue in the inference engine.
+The precision issue is known to occur under the following conditions:
+
+1. Using non-Hopper architecture GPUs, such as A100, L20, B200, etc.
+
+2. Using vLLM `with issue 22103 `_ as the inference engine.
+
+3. The input and output texts are long, for example, in multi-turn scenarios using reasoning models like Qwen3 for RL training.
+
+If all three conditions above are met and you observe that rollout_probs_diff_mean is too high, it is recommended to add the following parameter to resolve the precision issue:
+
+.. code:: bash
+
+ +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_cascade_attn=True
+
+The root cause of this issue is a bug in the flash attention used by vLLM. Although it has been fixed, the fix has not yet been released in the latest version of vLLM (v0.10.2).
+For a more detailed explanation of this issue, please refer to the fix "Fix LSE output error in FA2 kv-split".
+
+Until vLLM releases a new version with this fix, it is recommended to use the configuration above to disable cascade attention as a workaround.
diff --git a/code/RL_model/verl/verl_train/docs/hybrid_flow.rst b/code/RL_model/verl/verl_train/docs/hybrid_flow.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3aa5a4a97cb88e564babc11392899149338a5b49
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/hybrid_flow.rst
@@ -0,0 +1,266 @@
+=========================================================
+HybridFlow Programming Guide
+=========================================================
+
+Last updated: 06/02/2025.
+
+.. _vermouth: https://github.com/vermouth1992
+
+Author: `Chi Zhang <https://github.com/vermouth1992>`_
+
+verl is an open source implementation of the paper HybridFlow [1]_. In this section, we will introduce the basic concepts of HybridFlow, the motivation, and how to program with verl APIs.
+
+Motivation and Design
+------------------------
+We use dataflow to represent RL systems [4]_.
+
+DataFlow
+~~~~~~~~~~~~~~~~~~~~
+
+Dataflow is an abstraction of computations. Neural network training is a typical dataflow and can be represented by a computational graph.
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/dataflow.jpeg?raw=true
+ :alt: The dataflow graph from CS231n 2024 lecture 4
+
+This figure [2]_ represents the computation graph of a polynomial function followed by a sigmoid function. In the data flow of neural network computation, each node represents an operator, and each edge represents the direction of forward/backward propagation. The computation graph determines the architecture of the neural network.
+
+RL as a dataflow problem
+++++++++++++++++++++++++++++++++++++++++++++++
+
+Reinforcement learning (RL) training can also be represented as a dataflow. Below is the dataflow graph that represents the PPO algorithm used in RLHF [3]_:
+
+.. image:: https://picx.zhimg.com/70/v2-cb8ab5ee946a105aab6a563e92682ffa_1440w.avis?source=172ae18b&biz_tag=Post
+ :alt: PPO dataflow graph, credit to Zhihu 低级炼丹师
+
+However, the dataflow of RL has fundamental differences compared with dataflow of neural network training as follows:
+
++--------------------------+--------------------------------------------------+---------------------+
+| Workload | Node | Edge |
++--------------------------+--------------------------------------------------+---------------------+
+| Neural Network Training | Operator (+/-/matmul/softmax) | Tensor movement |
++--------------------------+--------------------------------------------------+---------------------+
+| Reinforcement Learning | High-level operators (rollout/model forward) | Data Movement |
++--------------------------+--------------------------------------------------+---------------------+
+
+In the case of tabular reinforcement learning, each operator is a simple scalar math operation (e.g., a Bellman update). In deep reinforcement learning (DRL), each operator is a high-level neural network computation such as model inference/update. This makes RL a two-level dataflow problem:
+
+- Control flow: defines how the high-level operators are executed (e.g., in PPO, we first perform rollout, then advantage computation, and finally training). It expresses the **core logic of RL algorithms**.
+- Computation flow: defines the dataflow of **neural network computation** (e.g., model forward/backward/optimizer).
+
+
+Design Choices
+~~~~~~~~~~~~~~~~~~~~
+The model size used in DRL before the LLM era is typically small. Thus, the high-level neural network computation can be done in a single process. This enables embedding the computation flow inside the control flow as a single process.
+
+However, in the LLM era, the computation flow (e.g., training neural network) becomes a multi-process program. This naturally leads to two design choices:
+
+1. Convert the control flow into a multi-process program as well. Then colocate with computation flow (unified multi-controller)
+
+- Advantages:
+
+ - Achieves the **optimal performance** under fixed computation flow and control flow as the communication overhead in both training and data transfer is minimized.
+
+- Disadvantages:
+
+ - The computation and/or control flow is **hard to reuse** from a software perspective, as computation code is coupled with specific controller code. For example, the training loop of PPO is generic. Say we have a PPO training flow implemented with a specific computation flow such as FSDP. Neither the control flow nor the computation flow can be reused if we want to switch the computation flow from FSDP to Megatron, due to the coupling of control and computation flows.
+ - Requires more effort from the user under flexible and dynamic control flows, due to the multi-process nature of the program.
+
+2. Separate the flows: single process for the control flow and multi-process for computation flow
+
+- Advantages:
+
+ - The computation flow defined elsewhere can be **easily reused** after the decoupling.
+ - The controller runs on a single process. Implementing a new RL algorithm with a **different control flow is simple and easy**.
+
+- Disadvantages:
+
+ - Additional **data communication overhead** each time the controller process and computation processes interact. The data has to be sent back and forth.
+
+In verl, the latter strategy with separate control flow and computation flow is adopted. verl is designed to decouple the control flow of RL algorithms from the implementation of computation engines.
+
+Overall Execution Diagram
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Below is a simplified diagram denoting the execution of a reinforcement learning job. In the diagram, the controller runs on a single process, while the generator/actor workers and critic workers run on multiple processes, placed on specific resource groups. For rollout, the controller passes the data to the generator to perform sample generation. When the rollout is done, the data is passed back to the controller for the next step of the algorithm. Similar execution is done for the other workers. With the hybrid controller design, the data flow and computation are decoupled to provide both efficiency in computation and flexibility in defining algorithm training loops.
+
+.. figure:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/driver_worker.png?raw=true
+ :alt: The execution diagram
+
+Codebase walkthrough (PPO)
+------------------------------------------------
+
+Entry function
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Code: https://github.com/volcengine/verl/blob/main/verl/trainer/main_ppo.py
+
+In this file, we define a remote function ``main_task`` that serves as the controller (driver) process, as shown in the figure above. We also define a ``RewardManager``, where users can customize their reward function based on the data source in the dataset. Note that ``RewardManager`` should return the final token-level reward that is optimized by RL algorithms, and that users can combine model-based rewards and rule-based rewards.
+``main_task`` constructs a RayPPOTrainer instance and launches the fit function. Note that ``main_task`` **runs as a single process**.
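+
+As an illustration, here is a minimal rule-based scoring function of the kind a ``RewardManager`` might call. The name and signature below are illustrative only, not verl's API:
+
+.. code-block:: python
+
+    def gsm8k_style_score(solution_str: str, ground_truth: str) -> float:
+        """Toy rule-based scorer: 1.0 if the final answer matches, else 0.0."""
+        answer = solution_str.split("####")[-1].strip()
+        return 1.0 if answer == ground_truth.strip() else 0.0
+
+The manager is then responsible for expanding such scalar scores into the token-level reward tensor, e.g., by placing each score on the final token of the corresponding response.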
+
+We highly recommend that ``main_task`` NOT be scheduled on the head node of the Ray cluster, because ``main_task`` will consume a lot of memory while the head usually has very limited resources.
+
+Ray trainer
+~~~~~~~~~~~~~~~~~~~~
+Code: https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/ray_trainer.py
+
+The RayPPOTrainer manages
+
+- Worker and WorkerGroup construction
+- The main loop of the PPO algorithm
+
+Note that the fit function of RayPPOTrainer **runs as a single process**.
+
+Worker and WorkerGroup construction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Each WorkerGroup manages a list of workers that run remotely. Note that the worker group runs in the process of its constructor.
+Each worker inside the WorkerGroup runs on a GPU. The worker group serves as a proxy for the controller process to interact with a list of workers, in order to perform certain computations. **To do so, we have to bind the methods of the worker to methods of the WorkerGroup and define the data dispatch and data collection.** This is done via the simple decoration introduced in the Worker definition section below.
+
+For example, in PPO, we define 3 worker groups:
+
+- ActorRolloutRef: manages the actor, rollout and reference policy. ActorRolloutRefWorker can be instantiated as a single actor, a single rollout, a single reference policy, a combined actor/rollout or a combined actor/rollout/ref. This design aims at maximum code reuse in various scenarios. The reason for colocating actor and rollout is fast weight transfer using NCCL. The reason for colocating actor and reference is to implement an efficient LoRA PPO, as the reference policy is simply the base model of PPO in LoRA. The colocation is done via ``verl.single_controller.ray.base.create_colocated_worker_cls``, which creates a single Ray remote class exposing all class methods from these roles.
+- Critic: manages the critic model
+- Reward: manages the reward model
+
+The worker group will be constructed on the resource pool it designates. The resource pool is a set of GPUs in the ray cluster.
+
+Worker definition
+~~~~~~~~~~~~~~~~~~~~
+
+.. _ActorRolloutRefWorker: https://github.com/volcengine/verl/blob/main/verl/workers/fsdp_workers.py
+
+We take ActorRolloutRefWorker_ as an example.
+The APIs it should expose to the controller process are:
+
+- init_model: build the underlying model
+- generate_sequences: given prompts, generate responses
+- compute_log_prob: compute the log-probability of a generated sequence using actor
+- compute_ref_log_prob: compute the log-probability of a generated sequence using reference policy
+- save_checkpoint: save the checkpoint
+
+Note that these methods are defined in the worker and can only be invoked via remote calls. For example, if the controller process wants to initialize the model, it has to call
+
+.. code-block:: python
+
+ for worker in actor_rollout_ref_wg:
+ worker.init_model.remote()
+
+If the controller process wants to generate sequences, it has to call
+
+.. code-block:: python
+
+    data = xxx  # a DataProto batch produced upstream (placeholder)
+    # split the data into dp chunks
+    data_dp_lst = data.split(dp_size)
+    output_dp_lst = []
+    for i, worker in enumerate(actor_rollout_ref_wg):
+        # dispatch one chunk to each worker; .remote() returns a future
+        output_future = worker.generate_sequences.remote(data_dp_lst[i])
+        output_dp_lst.append(output_future)
+    # block on all futures, then concatenate along the batch dimension
+    output = torch.cat(ray.get(output_dp_lst), dim=0)
+
+We observe that a controller process calling worker group methods can generally be divided into 3 steps:
+
+- Split the data into data-parallel chunks
+- Dispatch the corresponding chunk to each worker
+- Collect and concatenate the outputs when the computation finishes
+
+In verl, we provide syntactic sugar that encapsulates these 3 steps into a single call from the controller process.
+
+.. code-block:: python
+
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
+ def generate_sequences(data):
+ ...
+
+ # on the driver
+ output = actor_rollout_ref_wg.generate_sequences(data)
+
+We decorate the method of the worker with ``register``, which explicitly defines how the input data should be split and dispatched to each worker, and how the output data should be collected and concatenated by the controller. For example, ``Dispatch.DP_COMPUTE_PROTO`` splits the input data into dp chunks, dispatches one chunk to each worker, collects the outputs and concatenates the results. Note that this function requires the input and output to be a DataProto, defined in https://github.com/volcengine/verl/blob/main/verl/protocol.py.
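+
+Conceptually, the decorator only attaches metadata that the WorkerGroup later uses to generate a proxy method on the controller side. A simplified sketch of the idea (not verl's actual implementation):
+
+.. code-block:: python
+
+    def register(dispatch_mode):
+        def decorator(method):
+            # metadata read when the WorkerGroup binds this method
+            method.dispatch_mode = dispatch_mode
+            return method
+        return decorator
+
+    # For a DP_COMPUTE_PROTO-style mode, the generated proxy roughly:
+    #   1. splits the input DataProto into dp_size chunks,
+    #   2. calls method.remote(chunk) on each worker,
+    #   3. ray.get()s the futures and concatenates the outputs.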
+
+
+PPO main loop
+~~~~~~~~~~~~~~~~~~~~
+With the aforementioned APIs, we can implement the main loop of PPO as if it were a single-process program:
+
+.. code-block:: python
+
+ for prompt in dataloader:
+ output = actor_rollout_ref_wg.generate_sequences(prompt)
+ old_log_prob = actor_rollout_ref_wg.compute_log_prob(output)
+ ref_log_prob = actor_rollout_ref_wg.compute_ref_log_prob(output)
+ values = critic_wg.compute_values(output)
+ rewards = reward_wg.compute_scores(output)
+ # compute_advantages is running directly on the control process
+ advantages = compute_advantages(values, rewards)
+ output = output.union(old_log_prob)
+ output = output.union(ref_log_prob)
+ output = output.union(values)
+ output = output.union(rewards)
+ output = output.union(advantages)
+ # update actor
+ actor_rollout_ref_wg.update_actor(output)
+        critic_wg.update_critic(output)
+
+Takeaways
+~~~~~~~~~~~~~~~~~~~~
+- This programming paradigm enables users to use different computation backends without modifying the control process.
+- This programming paradigm enables flexible placement (by changing the mapping of WorkerGroup and ResourcePool) without modifying the control process.
+
+Repository organization
+------------------------------------------------
+
+Important code files in the repository are organized as below:
+
+.. code-block:: bash
+
+ verl # the verl package
+ trainer
+ main_ppo.py # the entrypoint for RL training
+ ppo
+ ray_trainer.py # the training loop for RL algorithms such as PPO
+ fsdp_sft_trainer.py # the SFT trainer with FSDP backend
+ config
+ generation.yaml # configuration template for rollout
+ ppo_trainer.yaml # configuration template for the RL trainer
+ workers
+ protocol.py # the interface of DataProto
+ fsdp_workers.py # the FSDP worker interfaces: ActorRolloutRefWorker, CriticWorker, RewardModelWorker
+ megatron_workers.py # the Megatron worker interfaces: ActorRolloutRefWorker, CriticWorker, RewardModelWorker
+ actor
+ dp_actor.py # data parallel actor with FSDP backend
+ megatron_actor.py # nD parallel actor with Megatron backend
+ critic
+ dp_critic.py # data parallel critic with FSDP backend
+      megatron_critic.py # nD parallel critic with Megatron backend
+ reward_model
+ megatron
+ reward_model.py # reward model with Megatron backend
+ rollout
+ vllm
+ vllm_rollout.py # rollout with vllm backend
+ hf_rollout.py # rollout with huggingface TGI backend
+ sharding_manager
+ fsdp_ulysses.py # data and model resharding when using FSDP + ulysses
+ fsdp_vllm.py # data and model resharding when using FSDP + ulysses + vllm
+ megatron_vllm.py # data and model resharding when using Megatron + vllm
+ utils
+ dataset # datasets for SFT/RM/RL
+ reward_score # function based reward
+ gsm8k.py # reward function for gsm8k dataset
+ math.py # reward function for math dataset
+ seqlen_balancing.py # the sequence balance optimization
+ models
+ llama # Megatron implementation for llama, deepseek, mistral, etc
+ transformers # ulysses integration with transformer models such as llama, qwen, etc
+    weight_loader_registry.py # registry of weight loaders for loading hf ckpt into Megatron
+ third_party
+ vllm # adaptor for vllm's usage in RL
+ vllm_spmd # vllm >= v0.7 adaptor
+ examples # example scripts
+ tests # integration and unit tests
+ .github # the configuration of continuous integration tests
+
+
+.. [1] HybridFlow: A Flexible and Efficient RLHF Framework: https://arxiv.org/abs/2409.19256v2
+.. [2] Data flow graph credit to CS231n 2024 lecture 4: https://cs231n.stanford.edu/slides/2024/lecture_4.pdf
+.. [3] PPO dataflow graph credit to 低级炼丹师 from Zhihu: https://zhuanlan.zhihu.com/p/635757674
+.. [4] RLFlow
diff --git a/code/RL_model/verl/verl_train/docs/index.rst b/code/RL_model/verl/verl_train/docs/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2e1bc7a04e276b27c84b113172acfe44f627bc97
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/index.rst
@@ -0,0 +1,218 @@
+Welcome to verl's documentation!
+================================================
+
+verl is a flexible, efficient and production-ready RL training framework designed for large language model (LLM) post-training. It is an open source implementation of the `HybridFlow <https://arxiv.org/abs/2409.19256v2>`_ paper.
+
+verl is flexible and easy to use with:
+
+- **Easy extension of diverse RL algorithms**: The hybrid programming model combines the strengths of single-controller and multi-controller paradigms to enable flexible representation and efficient execution of complex post-training dataflows, allowing users to build RL dataflows in a few lines of code.
+
+- **Seamless integration of existing LLM infra with modular APIs**: Decouples computation and data dependencies, enabling seamless integration with existing LLM frameworks, such as PyTorch FSDP, Megatron-LM, vLLM and SGLang. Moreover, users can easily extend to other LLM training and inference frameworks.
+
+- **Flexible device mapping and parallelism**: Supports various placement of models onto different sets of GPUs for efficient resource utilization and scalability across different cluster sizes.
+
+- Ready integration with popular HuggingFace models
+
+
+verl is fast with:
+
+- **State-of-the-art throughput**: By seamlessly integrating existing SOTA LLM training and inference frameworks, verl achieves high generation and training throughput.
+
+- **Efficient actor model resharding with 3D-HybridEngine**: Eliminates memory redundancy and significantly reduces communication overhead during transitions between training and generation phases.
+
+--------------------------------------------
+
+.. _Contents:
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Quickstart
+
+ start/install
+ start/quickstart
+ start/multinode
+ start/ray_debug_tutorial
+ start/more_resources
+ start/agentic_rl
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Programming guide
+
+ hybrid_flow
+ single_controller
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Data Preparation
+
+ preparation/prepare_data
+ preparation/reward_function
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Configurations
+
+ examples/config
+
+.. toctree::
+ :maxdepth: 1
+ :caption: PPO Example
+
+ examples/ppo_code_architecture
+ examples/gsm8k_example
+ examples/multi_modal_example
+ examples/skypilot_examples
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Algorithms
+
+ algo/ppo.md
+ algo/grpo.md
+ algo/collabllm.md
+ algo/dapo.md
+ algo/spin.md
+ algo/sppo.md
+ algo/entropy.md
+ algo/opo.md
+ algo/baseline.md
+ algo/gpg.md
+ algo/rollout_corr.md
+ algo/rollout_corr_math.md
+ algo/otb.md
+
+.. toctree::
+ :maxdepth: 1
+ :caption: PPO Trainer and Workers
+
+ workers/ray_trainer
+ workers/fsdp_workers
+ workers/megatron_workers
+ workers/sglang_worker
+ workers/trtllm_worker
+ workers/model_engine
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Performance Tuning Guide
+
+ perf/dpsk.md
+ perf/best_practices
+ perf/perf_tuning
+ README_vllm0.8.md
+ perf/device_tuning
+ perf/verl_profiler_system.md
+ perf/nsight_profiling.md
+ perf/torch_profiling.md
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Adding new models
+
+ advance/fsdp_extension
+ advance/megatron_extension
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Advanced Features
+
+ advance/checkpoint
+ advance/rope
+ advance/attention_implementation
+ advance/ppo_lora.rst
+ sglang_multiturn/multiturn.rst
+ sglang_multiturn/interaction_system.rst
+ advance/placement
+ advance/dpo_extension
+ examples/sandbox_fusion_example
+ advance/rollout_trace.rst
+ advance/rollout_skip.rst
+ advance/one_step_off
+ advance/agent_loop
+ advance/reward_loop
+ advance/fully_async
+ data/transfer_queue.md
+ advance/grafana_prometheus.md
+ advance/fp8.md
+ advance/async-on-policy-distill
+ advance/mtp.md
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Hardware Support
+
+ amd_tutorial/amd_build_dockerfile_page.rst
+ amd_tutorial/amd_vllm_page.rst
+ ascend_tutorial/ascend_quick_start.rst
+ ascend_tutorial/ascend_consistency.rst
+ ascend_tutorial/ascend_profiling_zh.rst
+ ascend_tutorial/ascend_profiling_en.rst
+ ascend_tutorial/dockerfile_build_guidance.rst
+ ascend_tutorial/ascend_sglang_quick_start.rst
+ ascend_tutorial/examples/gspo_optimization_practice.md
+ ascend_tutorial/examples/dapo_multi_model_optimization_practice.md
+ ascend_tutorial/examples/ascend_sglang_best_practices.rst
+
+.. toctree::
+ :maxdepth: 1
+ :caption: API References
+
+ api/data
+ api/single_controller.rst
+ api/trainer.rst
+ api/utils.rst
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Blog
+
+ blog/v0.7.md
+
+.. toctree::
+ :maxdepth: 2
+ :caption: FAQ
+
+ faq/faq
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Development Notes
+
+ sglang_multiturn/sandbox_fusion.rst
+
+Contribution
+-------------
+
+verl is free software; you can redistribute it and/or modify it under the terms
+of the Apache License 2.0. We welcome contributions.
+Join us on `GitHub <https://github.com/volcengine/verl>`_, `Slack <https://join.slack.com/t/verlgroup/shared_invite/zt-2w5p9o4c3-yy0x2Q56s_VlGLsJ93A6vA>`_ and `WeChat <https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/WeChat.JPG>`_ for discussions.
+
+Contributions from the community are welcome! Please check out our `project roadmap `_ and `good first issues `_ to see where you can contribute.
+
+Code Linting and Formatting
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+We use pre-commit to help improve code quality. To initialize pre-commit, run:
+
+.. code-block:: bash
+
+ pip install pre-commit
+ pre-commit install
+
+To resolve CI errors locally, you can also manually run pre-commit by:
+
+.. code-block:: bash
+
+ pre-commit run
+
+Adding CI tests
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+If possible, please add CI test(s) for your new feature:
+
+1. Find the most relevant workflow yml file, which usually corresponds to a ``hydra`` default config (e.g. ``ppo_trainer``, ``ppo_megatron_trainer``, ``sft_trainer``, etc).
+2. Add related path patterns to the ``paths`` section if not already included.
+3. Minimize the workload of the test script(s) (see existing scripts for examples).
+
+We are HIRING! Send us an `email `_ if you are interested in internship/FTE opportunities in MLSys/LLM reasoning/multimodal alignment.
diff --git a/code/RL_model/verl/verl_train/docs/perf/best_practices.rst b/code/RL_model/verl/verl_train/docs/perf/best_practices.rst
new file mode 100644
index 0000000000000000000000000000000000000000..69d8286710ad01d04cf60366a52b398f3dfb7b6d
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/perf/best_practices.rst
@@ -0,0 +1,242 @@
+Verl LLM Best Practices (DAPO + Qwen3-235B)
+===========================================
+
+Last updated: 11/03/2025.
+
+Purpose
+-------
+
+This guide uses DAPO training on Qwen3-235B as a concrete example. We unpack every parameter that appears in the optimization objective, map it to Verl configuration entries, and share field-tested recommendations so you can derive sensible settings for your own workloads.
+
+.. note::
+
+ 1. The guide only covers the subset of parameters required to reproduce the DAPO experiments discussed here. For the full list, refer to the ``config`` components in the Verl source tree: https://github.com/volcengine/verl/tree/main/verl/trainer/config
+ 2. PPO and GRPO introduce KL-constrained policies. We therefore include that setup in the explanations below. You can treat all configurations mentioned here as a DAPO pipeline augmented with a KL penalty.
+
+Optimization Objectives
+-----------------------
+
+DAPO objective
+~~~~~~~~~~~~~~
+
+.. math::
+
+   \begin{aligned}
+   \mathcal{J}_{\mathrm{DAPO}}(\theta)= & \mathbb{E}_{(q, a) \sim \mathcal{D},\left\{o_i\right\}_{i=1}^G \sim \pi_{\theta_{\text{old}}}(\cdot \mid q)} \\
+   & \left[\frac{1}{\sum_{i=1}^G\left|o_i\right|} \sum_{i=1}^G \sum_{t=1}^{\left|o_i\right|} \min \left(r_{i, t}(\theta) \hat{A}_{i, t}, \operatorname{clip}\left(r_{i, t}(\theta), 1-\varepsilon_{\text{low}}, 1+\varepsilon_{\text{high}}\right) \hat{A}_{i, t}\right)\right]
+   \end{aligned}
+
+.. math::
+
+   \text{s.t.} \quad 0 < \left|\left\{o_i \mid \text{is\_equivalent}\left(a, o_i\right)\right\}\right| < G
+
+Parameter-to-Configuration Mapping
+----------------------------------
+
+:math:`\pi_{\theta_{old}}`
+ - ``actor_rollout_ref.rollout.tensor_model_parallel_size``:
+   Tensor-parallel size of the rollout engine. In bf16/fp16 the weights alone occupy roughly ``2 * model_parameters`` bytes, so leave each replica headroom beyond that for the KV cache. Increase TP gradually to expand KV cache capacity while watching communication cost, especially once TP > 8.
+ - ``actor_rollout_ref.rollout.temperature`` / ``top_p`` / ``top_k``:
+ Sampling knobs for rollout. Keep enough randomness; ``temperature=1.0``, ``top_p=1.0``, ``top_k=-1`` are good defaults.
+ - ``actor_rollout_ref.rollout.val_kwargs.temperature`` / ``top_p`` / ``top_k`` / ``do_sample`` / ``n``:
+ Sampling options for validation. Set ``temperature > 0`` to prevent repetitive thinking chains. For small test sets (e.g., AIME24) raise ``n`` (64 is a common choice) to reduce variance. A practical starting point is ``temperature=1.0``, ``top_p=0.7``, ``top_k=-1``, ``do_sample=True``, ``n=1`` and then increase ``n`` as needed.
+ - ``+actor_rollout_ref.rollout.engine_kwargs.vllm.*`` / ``+actor_rollout_ref.rollout.engine_kwargs.sglang.*``:
+ Extra backend options injected via the ``+`` syntax. Consult backend docs for exact semantics. Some switches (for example ``pipeline_parallel_size``) may not be supported yet; when TP=32, ``enable_expert_parallel=True`` can even slow down DeepSeek-V3 rollout, so benchmark carefully.
+
+:math:`\pi_\theta`
+ - ``data.train_batch_size``:
+   Total batch size per training iteration. Each rollout produces ``train_batch_size * n`` samples. Larger values reduce the number of rollouts but increase off-policy drift (see the arithmetic sketch at the end of this entry).
+ - ``actor_rollout_ref.actor.ppo_mini_batch_size``:
+ Mini-batch size per optimization step. Tune it the same way you would for standard deep learning workloads.
+ - ``actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu``:
+ Samples processed per forward pass on one GPU group (a Megatron group contains TP * PP * CP GPUs). Keep it ≤ ``ppo_mini_batch_size`` and as large as memory allows.
+ - ``actor_rollout_ref.actor.use_dynamic_bsz``:
+ Enable dynamic batch sizing to adapt to sequence length and improve throughput.
+ - ``actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu``:
+ Maximum tokens per GPU when computing log probabilities under dynamic batching. Set it to at least a multiple of ``max_prompt_length + max_response_length`` to prevent truncation.
+ - Megatron parallelism parameters (``pipeline_model_parallel_size`` / ``tensor_model_parallel_size`` / ``expert_model_parallel_size`` / ``expert_tensor_parallel_size`` / ``context_parallel_size``):
+ Balance PP/TP/EP/ETP/CP to match memory and network constraints. In bf16/fp16, each parameter consumes roughly ``2 / TP`` bytes; if you keep FP32 master weights or skip optimizer offload, reserve another 4–8 bytes for Adam. Activations scale with ``micro_batch_size × sequence_length × hidden_size`` and can be mitigated with gradient checkpointing, dynamic batches, or offload. Prefer increasing TP first, add PP when necessary, extend sequence capacity with CP, align EP/ETP with TP for MoE models, and keep DP minimal on constrained clusters while combining with offload. Always align the setup with hardware topology and communication cost.
+ - ``actor_rollout_ref.model.use_fused_kernels``:
+ Enable Verl’s fused kernels for supported models to squeeze out additional performance.
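+
+ A quick arithmetic sketch of how ``train_batch_size``, ``rollout.n`` and ``ppo_mini_batch_size`` interact (illustrative numbers; exact units may vary by trainer version):
+
+ .. code-block:: python
+
+    train_batch_size = 512       # prompts consumed per training iteration
+    n = 16                       # responses sampled per prompt (rollout.n)
+    ppo_mini_batch_size = 256    # samples per optimizer step
+
+    samples_per_iteration = train_batch_size * n                          # 8192
+    updates_per_iteration = samples_per_iteration // ppo_mini_batch_size  # 32
+
+ The larger ``train_batch_size * n`` is relative to ``ppo_mini_batch_size``, the more optimizer steps run on data from the same, increasingly stale, rollout policy.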
+
+:math:`\hat{A}_{i,t}`
+ - ``algorithm.adv_estimator``:
+ Advantage estimator. Set to ``grpo`` for DAPO/GRPO.
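+
+ For orientation, the ``grpo`` estimator normalizes each sampled response's scalar reward within its group of :math:`G` rollouts (the standard GRPO formulation; every token of response :math:`o_i` shares the same advantage):
+
+ .. math::
+
+    \hat{A}_{i,t} = \frac{R_i - \operatorname{mean}\left(\{R_j\}_{j=1}^G\right)}{\operatorname{std}\left(\{R_j\}_{j=1}^G\right)}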
+
+:math:`R_i`
+ - ``reward_model.reward_manager``:
+ Reward aggregation strategy. Use ``dapo`` for DAPO and ``naive`` for GRPO.
+
+:math:`D_{KL}`
+ - ``algorithm.use_kl_in_reward``:
+ Whether to add a KL term to the reward. ``True`` for PPO, ``False`` for GRPO and DAPO.
+ - ``actor_rollout_ref.actor.use_kl_loss``:
+ Whether to include a KL loss term. ``False`` for PPO, ``True`` for GRPO, ``False`` for DAPO.
+
+:math:`\beta`
+ - ``actor_rollout_ref.actor.kl_loss_coef``:
+ Weight of the KL loss. Start around 0.001. Larger values curb reward hacking but reduce exploration.
+ - ``algorithm.kl_ctrl.kl_coef``:
+ KL coefficient applied within the reward. Adjust to match your tolerance for divergence.
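+
+ As a hedged summary of where these two coefficients act (notation follows the entries above): ``algorithm.use_kl_in_reward=True`` shapes the token-level reward, while ``actor_rollout_ref.actor.use_kl_loss=True`` adds a weighted KL term to the policy loss:
+
+ .. math::
+
+    r_{i,t} = R_{i,t} - \beta_{\text{reward}} \, D_{KL}\left(\pi_\theta \,\|\, \pi_{ref}\right), \qquad \mathcal{L} = \mathcal{L}_{\text{policy}} + \beta_{\text{loss}} \, D_{KL}\left(\pi_\theta \,\|\, \pi_{ref}\right)
+
+ where :math:`\beta_{\text{reward}}` corresponds to ``algorithm.kl_ctrl.kl_coef`` and :math:`\beta_{\text{loss}}` to ``actor_rollout_ref.actor.kl_loss_coef``.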
+
+:math:`\pi_{old}`
+ - ``actor_rollout_ref.rollout.log_prob_use_dynamic_bsz``:
+ Enable dynamic batching when the old policy computes log-probabilities. Recommended.
+
+:math:`\pi_{ref}`
+ - ``actor_rollout_ref.ref.log_prob_use_dynamic_bsz``:
+ Enable dynamic batching for the reference policy. Recommended.
+ - Reference Megatron parallelism:
+ Keep ``pipeline_model_parallel_size``, ``tensor_model_parallel_size``, ``expert_model_parallel_size``, ``expert_tensor_parallel_size``, and ``context_parallel_size`` in sync with the actor.
+ - ``actor_rollout_ref.ref.megatron.param_offload``:
+ Offload reference parameters to CPU when the actor does so. Even without gradients or optimizer states, parity helps with capacity planning.
+
+:math:`o_i` / :math:`|o_i|`
+ - ``actor_rollout_ref.actor.loss_agg_mode``:
+ Loss aggregation mode. Token-level ``token-mean`` matches the recommendations from Dr.GRPO and DAPO; use ``seq-mean-token-mean`` to reproduce the original GRPO behavior.
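+
+ A minimal sketch contrasting the two modes on a ``[G, T]`` per-token loss with a boolean response mask (names are illustrative, not verl's internals):
+
+ .. code-block:: python
+
+    import torch
+
+    def aggregate_loss(loss, mask, mode="token-mean"):
+        if mode == "token-mean":
+            # Dr.GRPO / DAPO: every valid token weighs equally across the batch
+            return (loss * mask).sum() / mask.sum()
+        # "seq-mean-token-mean" (original GRPO): average within each sequence
+        # first, so short and long responses contribute equally
+        seq_mean = (loss * mask).sum(dim=-1) / mask.sum(dim=-1)
+        return seq_mean.mean()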
+
+Reference Hardware Configurations
+---------------------------------
+
+The tables below list known-good minimal ("MIN") resource configurations per model size, with links to the corresponding example scripts. Contributors:
+
+- `Xiangyongan `_
+- `SimonHuang `_
+
+1.5B
+~~~~
+
+.. list-table::
+ :widths: auto
+ :header-rows: 1
+
+ * - Tag
+ - Model
+ - Task
+ - Resource
+ - MaxBatch
+ - Train
+ - Infer
+ - Link
+ - Contributor
+ * - MIN
+ - Qwen2.5-1.5B
+ - GRPO-LoRA
+ - 1*H100
+ - 128
+ - fsdp
+ - vllm0.8.3
+ - `qwen2-1.5b_grpo-lora_1_h100_fsdp_vllm.sh `_
+ - `SimonHuang `_
+
+3B
+~~~
+
+.. list-table::
+ :widths: auto
+ :header-rows: 1
+
+ * - Tag
+ - Model
+ - Task
+ - Resource
+ - MaxBatch
+ - Train
+ - Infer
+ - Link
+ - Contributor
+ * - MIN
+ - Qwen2.5-3B
+ - GRPO-LoRA
+ - 1*H100
+ - 62
+ - fsdp
+ - vllm0.8.3
+ - `qwen2-3b_grpo-lora_1_h100_fsdp_vllm.sh `_
+ - `SimonHuang `_
+
+7B
+~~~
+
+.. list-table::
+ :widths: auto
+ :header-rows: 1
+
+ * - Tag
+ - Model
+ - Task
+ - Resource
+ - MaxBatch
+ - Train
+ - Infer
+ - Link
+ - Contributor
+ * - MIN
+ - Qwen2-7B
+ - GRPO
+ - 2*H800
+ - \
+ - fsdp
+ - vllm0.8.2
+ - `qwen2-7b_grpo_2_h800_fsdp_vllm `_
+ - `Xiangyongan `_
+ * - MIN
+ - Qwen2.5-7B
+ - GRPO-LoRA
+ - 1*H100
+ - 16
+ - fsdp
+ - vllm0.8.3
+ - `qwen2-7b_grpo-lora_1_h100_fsdp_vllm.sh `_
+ - `SimonHuang `_
+
+14B
+~~~
+
+.. list-table::
+ :widths: auto
+ :header-rows: 1
+
+ * - Tag
+ - Model
+ - Task
+ - Resource
+ - MaxBatch
+ - Train
+ - Infer
+ - Link
+ - Contributor
+ * - MIN
+ - Qwen2-14B
+ - GRPO
+ - 4*H800
+ - \
+ - fsdp
+ - vllm0.8.2
+ - `qwen2-14b_grpo_4_h800_fsdp_vllm `_
+ - `Xiangyongan `_
+ * - MIN
+ - Qwen2.5-14B
+ - GRPO-LoRA
+ - 2*H100
+ - 116
+ - fsdp
+ - vllm0.8.3
+ - `qwen2-14b_grpo-lora_2_h100_fsdp_vllm.sh `_
+ - `SimonHuang `_
+
+32B
+~~~
+
+.. list-table::
+ :widths: auto
+ :header-rows: 1
+
+ * - Tag
+ - Model
+ - Task
+ - Resource
+ - MaxBatch
+ - Train
+ - Infer
+ - Link
+ - Contributor
+ * - MIN
+ - Qwen2-32B
+ - GRPO
+ - 8*H20
+ - \
+ - megatron
+ - vllm0.8.2
+ - `qwen2-32b_grpo_8_h20_megatron_vllm `_
+ - `Xiangyongan `_
+ * - MIN
+ - Qwen2.5-32B
+ - GRPO-LoRA
+ - 4*H100
+ - 180
+ - fsdp
+ - vllm0.8.3
+ - `qwen2-32b_grpo-lora_4_h100_fsdp_vllm.sh `_
+ - `SimonHuang `_
+
+70B
+~~~
+
+.. list-table::
+ :widths: auto
+ :header-rows: 1
+
+ * - Tag
+ - Model
+ - Task
+ - Resource
+ - MaxBatch
+ - Train
+ - Infer
+ - Link
+ - Contributor
+ * - MIN
+ - Qwen2-70B
+ - GRPO
+ - 32*H20
+ - \
+ - fsdp
+ - vllm0.8.2
+ - `qwen2-70b_grpo_32_h20_fsdp_vllm `_
+ - `Xiangyongan `_
+ * - MIN
+ - Qwen2-70B
+ - GRPO
+ - 32*H800
+ - \
+ - fsdp
+ - vllm0.8.3
+ - `qwen2-70b_grpo_32_h800_fsdp_vllm `_
+ - `Xiangyongan `_
+ * - MIN
+ - Qwen2.5-72B
+ - GRPO-LoRA
+ - 8*H100
+ - 176
+ - fsdp
+ - vllm0.8.3
+ - `qwen2-72b_grpo-lora_8_h100_fsdp_vllm.sh `_
+ - `SimonHuang `_
+
+405B
+~~~~
+
+.. table::
+ :widths: auto
+
+ ====== ====== ====== ======== ======== ====== ====== ======
+ tag model task resource MaxBatch train infer link
+ ====== ====== ====== ======== ======== ====== ====== ======
+ \ \ \ \ \ \ \
+ ====== ====== ====== ======== ======== ====== ====== ======
+
+671B
+~~~~
+
+.. table::
+ :widths: auto
+
+ ====== ====== ====== ======== ======== ====== ====== ======
+ tag model task resource MaxBatch train infer link
+ ====== ====== ====== ======== ======== ====== ====== ======
+ \ \ \ \ \ \ \
+ ====== ====== ====== ======== ======== ====== ====== ======
diff --git a/code/RL_model/verl/verl_train/docs/perf/dpsk.md b/code/RL_model/verl/verl_train/docs/perf/dpsk.md
new file mode 100644
index 0000000000000000000000000000000000000000..7ea5bd196c3a63cc8d5e06189eb8dc92400136ce
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/perf/dpsk.md
@@ -0,0 +1,88 @@
+# Training DeepSeek 671b
+
+Last updated: 08/20/2025.
+
+verl integrates Megatron to support large MoE models such as `Qwen3-235B-A22B` and `deepseek-ai/DeepSeek-V3`. This is an ongoing community effort.
+
+Along the way, the community added the following features and optimizations that enable verl to work with larger models:
+- per tensor weight resharding between rollout and training
+- context parallelism and expert parallelism enabled via megatron
+- dynamic batch size (sequence balance) for megatron
+- reduced ray-related serialization overhead
+- optimizer offloading, recomputation, and efficient kernels
+- various debugging metrics and utils
+- hybrid optimizer
+
+and the Megatron backend now supports a wider list of models:
+- DeepSeek-V3
+- Moonlight
+- Qwen3
+- Qwen2.5-VL (to be merged soon)
+- Qwen2
+- Mixtral
+
+## Getting Started
+
+### preparation
+The recommended image with pre-built Megatron dependency is `verlai/verl:app-verl0.4-vllm0.8.5-mcore0.13.0-preview`, which is built using the Dockerfile at [docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.vllm.mcore0.13.preview](https://github.com/volcengine/verl/blob/main/docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.vllm.mcore0.13.preview).
+
+The image is built for Hopper GPUs with DeepEP. It does not support non-Hopper GPUs such as the A100; you may need to reinstall DeepEP to make it work on A100.
+
+With `OFFLOAD_FRACTION=1`, the system's minimum requirements are lowered. It can run on as few as 96 H20 (96GB) GPUs for DeepSeek-V3, and on as few as 32 H20 (96GB) GPUs for Qwen3-235B-A22B. However, this configuration will use 1.6TB CPU memory per node. If you run out of CPU memory or require faster training speed, you can add more nodes.
+
+### DeepSeek 671b
+
+For DeepSeek-V3 671b, please refer to [examples/grpo_trainer/run_deepseek671b_math_megatron_96gb.sh](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_deepseek671b_math_megatron_96gb.sh).
+
+MTP and quantization are disabled during RL training.
+
+To train your project, configure the following environment variables based on the number of available GPUs. These are recommended settings and can be adjusted based on your specific hardware.
+| num gpus | NNODES | TP | PP | EP | OFFLOAD_FRACTION | OFFLOAD_OPTIM | LAST_LAYER |
+| -- | -- | -- | -- | -- | -- | -- | -- |
+| 96 | 12 | 8 | 12 | 8 | 1. | False | 6 |
+| 128 | 16 | 8 | 16 | 8 | 0.5 | True | 1 |
+| 256 | 32 | 8 | 16 | 8 | 0. | True | 1 |
+| 512 | 64 | 1 | 16 | 32 | 0 | True | 1 |
+
+### Qwen3 235b
+
+For Qwen3-235b, please refer to [examples/grpo_trainer/run_qwen3-235b_megatron_96gb.sh](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen3-235b_megatron_96gb.sh).
+
+To train your project, configure the following environment variables based on the number of available GPUs. These are recommended settings and can be adjusted based on your specific hardware.
+| num gpus | NNODES | TP | PP | EP | OFFLOAD_FRACTION | OFFLOAD_OPTIM | LAST_LAYER |
+| -- | -- | -- | -- | -- | -- | -- | -- |
+| 32 | 4 | 4 | 8 | 4 | 1. | False | 6 |
+| 64 | 8 | 4 | 8 | 4 | 0.5 | True | 6 |
+| 128 | 16 | 4 | 8 | 4 | 0 | True | 6 |
+| 256 | 32 | 4 | 8 | 4 | 0 | True | 6 |
+
+### Benchmark
+Here are some benchmark results for DeepSeek / Qwen3-235B. All configurations match the recommended settings based on the number of GPUs.
+
+| model | num gpus | mean response length | rollout time(s) | GPU memory(GB) | CPU memory(GB) | MFU | step time(s) |
+| -- | -- | -- | -- | -- | -- | -- | -- |
+| DeepSeek 671b | 96 | 1960 | 1050 | 66 | 1500 | 0.19 | 1700 |
+
+### Qwen3-30B-A3B MOE
+
+For Qwen3-30b, please refer to [examples/grpo_trainer/run_qwen3moe-30b_megatron_96gb.sh](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen3moe-30b_megatron_96gb.sh).
+
+To train your project, configure the following environment variables based on the number of available GPUs. These are recommended settings and can be adjusted based on your specific hardware.
+| num gpus | NNODES | TP | PP | EP | OFFLOAD_FRACTION | OFFLOAD_OPTIM | MFU |
+| -- | -- | -- | -- | -- | -- | -- | -- |
+| 8 | 1 | 1 | 1 | 8 | 1. | True | 0.4 |
+| 16 | 2 | 1 | 1 | 8 | 1. | True | 0.37 |
+| 32 | 4 | 1 | 1 | 8 | 1. | True | 0.31 |
+
+
+## Upcoming Optimizations
+
+The community continues to optimize large MoE models; ongoing efforts include:
+- further optimizing memory consumption and providing recommended/tuned configurations for various machine types
+- optimizing long context RL training performance
+- performance improvement with SGLang x Megatron
+
+We invite the community to try and improve verl together. Get connected with us on [slack](https://join.slack.com/t/verlgroup/shared_invite/zt-2w5p9o4c3-yy0x2Q56s_VlGLsJ93A6vA)/[wechat](https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/WeChat.JPG)/[Github issues](https://github.com/volcengine/verl/issues/708)!
+
+## Acknowledgement
+@vermouth1992 @ISEEKYAN @ETOgaosion @yzlnew @ShareLer @BearBiscuit05 @ccclyu @ann-qin-lu @SwordFaith @zzong2006 @zhaochenyang20 @ocss884 @eric-haibin-lin @chenhaiq @techkang
diff --git a/code/RL_model/verl/verl_train/docs/perf/nsight_profiling.md b/code/RL_model/verl/verl_train/docs/perf/nsight_profiling.md
new file mode 100644
index 0000000000000000000000000000000000000000..490de5e7e4f7b6ba6c0e372eb7c0c3bfce2a77b9
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/perf/nsight_profiling.md
@@ -0,0 +1,94 @@
+# NVIDIA Nsight Systems profiling in verl
+
+Last updated: 06/20/2025.
+
+This guide explains how to use NVIDIA Nsight Systems for profiling verl training runs.
+
+## Configuration
+
+Profiling in verl can be configured through several parameters in the trainer configuration file (ppo_trainer.yaml or other files like dapo_trainer.yaml):
+
+### Prerequisites
+
+The Nsight Systems version matters; please refer to `docker/Dockerfile.vllm.sglang.megatron` for the version we use.
+
+### Global profiling control
+
+verl has one single controller process and multiple worker processes, and both can be profiled. Since the controller process can be executed on any node in the cluster, a message is printed in the log indicating the controller process's node hostname and process ID.
+
+In `global_profiler`, three new config entries control the profiler behaviors:
+
+* **`global_profiler.steps`**. List of step numbers at which profiling should be performed. For example, `[1, 2, 5]` profiles steps 1, 2, and 5, while `null` disables profiling.
+
+* **`global_profiler.profile_continuous_steps`**. If true and `global_profiler.discrete==False` (see below), consecutive steps in `global_profiler.steps` are combined into one database; for example, steps 1 and 2 above share one database, while step 5 gets its own. If false, every step occupies at least one database. This config exists so that program behavior between steps can be observed.
+
+Nsys options in controller nodes and worker nodes are configured in `global_profiler.global_tool_config.nsys`:
+
+* **`global_profiler.global_tool_config.nsys.controller_nsight_options`**. This config group is for the single controller. All fields in this config group will be just sent to Nsight Systems when Ray starts the controller process. `ppo_trainer.yaml` provides a workable example. Users can reference [Nsight Systems manual](https://docs.nvidia.com/nsight-systems/UserGuide/index.html) and [Ray user guide](https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html) for more details.
+* **`global_profiler.global_tool_config.nsys.worker_nsight_options`**. This config group is for the worker processes. Similarly, all fields in this config group are passed to Nsight Systems when Ray starts the worker processes. The capture range controls when the profiler starts and stops, so `capture-range: "cudaProfilerApi"` is fixed; do not change it. Users can set `capture-range-end` with an accurate calculation, or simply leave it `null`.
+
+### Worker process profiling
+
+verl manages multiple RL roles, _Actor_, _Ref_, _Rollout_, _Critic_, _Reward_, which are implemented in different Worker classes. These workers can be combined into one Ray actor, running in a process group. Each RL role has its own profiling config group, `profiler`, which consists of three fields:
+
+* **`all_ranks` and `ranks`**. When `all_ranks` is set to `True`, all ranks are profiled; when set to `False`, the ranks listed in `ranks` are profiled. By default, verl profiles the whole training process in a series of `worker_process_<PID>.<RID>.nsys-rep` files, one per process rank, where PID is the process ID and RID is the capture range ID.
+* **`discrete`**. When set to `False`, all the role's actions in one training step are dumped into one database. When set to `True`, the actions annotated by `DistProfiler.annotate` are dumped into discrete databases, so each role's action occupies one database.
+* **verl collocate mode**. verl can combine two Worker subclasses into one Worker actor. In this case, the user should make sure the combined Workers have a consistent `discrete` setting. The Nsight Systems profiler uses a `torch.cuda.profiler.start()` and `stop()` pair to dump each database regardless.
+
+### Where to find the profiling data
+
+By default, the `*.nsys-rep` files are saved in the directory `/tmp/ray/session_latest/logs/nsight/` on each node. According to the Ray manual, this default directory is not changeable; ["however, Ray preserves the `--output` option of the default config"](https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html).
+
+This may be inconvenient, but it is understandable: Ray may start hundreds of processes, and saving all files in one central place would put significant pressure on a network file system.
+
+## Usage Example
+
+To enable profiling for specific components and steps, modify your ppo_trainer.yaml as in the examples below.
+
+### Disable profiler
+
+```yaml
+    global_profiler:
+        steps: null # disable profiling
+```
+
+### Enable profiler and one database for one training step
+
+```yaml
+ global_profiler:
+ steps: [1, 2, 5]
+ discrete: False
+ actor_rollout_ref:
+ actor:
+ profiler:
+ enable: True
+ all_ranks: True
+ # rollout & ref follow actor settings
+ critic:
+ profiler:
+ enable: True
+ all_ranks: True
+ reward_model:
+ profiler:
+ enable: True
+ all_ranks: True
+```
+
+### Enable profiler and multiple databases for one training step
+
+```yaml
+    global_profiler:
+        steps: [1, 2, 5]
+        discrete: True
+```
+
+## Profiling Output
+
+When profiling is enabled, verl will generate Nsight Systems profiles for the specified components and steps. The profiles will include:
+
+- CUDA kernel execution
+- Memory operations
+- CPU-GPU synchronization
+- NVTX markers for key operations
+
+Nsight Systems supports multi-report view, to open multiple databases together. In this mode, different processes and steps can be aligned in one time line for better analysis.
diff --git a/code/RL_model/verl/verl_train/docs/perf/perf_tuning.rst b/code/RL_model/verl/verl_train/docs/perf/perf_tuning.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b5edd50c4dfc88afdf18f2525c44fb882dc96eaf
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/perf/perf_tuning.rst
@@ -0,0 +1,224 @@
+Performance Tuning Guide
+==============================
+
+Last updated: 07/17/2025.
+
+Author: `Guangming Sheng `_, `Jiali Zheng `_
+
+In this section, we will discuss how to tune the performance of all the stages in verl, including:
+
+1. Rollout generation throughput.
+
+2. Enable ``use_remove_padding=True`` for sequence packing (i.e., data packing and remove padding).
+
+3. Batch size tuning for forward and backward computation
+
+4. Enable ``use_dynamic_bsz=True`` for higher throughput.
+
+5. Utilize Ulysses Sequence Parallel for Long Context Training
+
+6. LigerKernel for SFT performance optimization
+
+7. Forward prefetch in FSDP training backend
+
+8. Memory optimization for entropy calculation from logits
+
+Rollout Generation Tuning
+--------------------------
+
+verl currently supports two rollout backends: vLLM and TGI (with SGLang support coming soon).
+
+Below are key factors for tuning vLLM-based rollout. Before tuning, we recommend setting ``actor_rollout_ref.rollout.disable_log_stats=False`` so that rollout statistics are logged.
+
+- Increase ``gpu_memory_utilization``.
+
+  - For vLLM v0.7.0 and later, the vLLM instance will only use a ``gpu_memory_utilization`` fraction of the **total** GPU memory.
+  - For SGLang, it's the fraction of the free GPU memory used for **static** memory like model weights and the KV cache. However, the remaining ``(1 - gpu_memory_utilization)`` will also be used during inference.
+
+ However, if model parameters and optimizer states are not offloaded, using too high a fraction can lead to OOM.
+ A value between 0.5 and 0.7 often strikes a good balance between high throughput and avoiding OOM.
+
+ Note: since the definition of ``gpu_memory_utilization`` varies across inference engines, a value that works well for one engine may cause OOM for another.
+
+- Adjust ``max_num_seqs`` or ``max_num_batched_tokens``.
+  If the GPU cache utilization is relatively low in the log, increasing ``max_num_seqs`` or ``max_num_batched_tokens``
+  can enlarge the effective batch size in the decoding stage, allowing more concurrent requests per batch.
+  We recommend setting ``max_num_batched_tokens > 2048`` for higher throughput.
+
+- Use a smaller ``tensor_parallel_size``.
+ When GPU resources allow, a smaller tensor parallel size spawns more vLLM replicas.
+ Data parallelism (DP) can yield higher throughput than tensor parallelism (TP), but also increases KVCache consumption.
+ Carefully balance the trade-off between more replicas and higher memory usage.
+  Our experiment in Sec. 8.4 of the `HybridFlow paper <https://arxiv.org/abs/2409.19256v2>`_ evaluates this trade-off.
+
+- Balance performance and memory using ``cudagraph_capture_sizes``.
+ If ``cudagraph_capture_sizes`` is set, vLLM will try to capture the model execution graph for different batch sizes.
+  Since CUDA graph memory cannot be offloaded to CPU, that memory stays on the GPU while the actor update is running.
+  Using smaller batch sizes can avoid OOM but slightly reduces throughput.
+  You must set ``enforce_eager=False`` to use ``cudagraph_capture_sizes``.
+
+More tuning details, such as dealing with preemption and chunked prefill,
+can be found in the `vLLM official tuning guide `_.
+
+For optimal performance, we recommend using vLLM v0.8.3 or later. See https://github.com/volcengine/verl/blob/main/docs/README_vllm0.8.md for details.
+
+Enable remove padding (sequence packing)
+-----------------------------------------
+
+Currently, for llama, mistral, gemma1 and qwen based models, users can enable `use_remove_padding=True` to utilize the
+sequence packing implementation provided by transformers library.
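+
+Conceptually, removing padding packs the non-pad tokens of a batch into one flat sequence plus cumulative-length offsets, as consumed by varlen flash-attention style kernels. A minimal sketch of the idea:
+
+.. code-block:: python
+
+    import torch
+
+    input_ids = torch.tensor([[5, 6, 7, 0, 0],
+                              [8, 9, 0, 0, 0]])   # 0 = pad token
+    attention_mask = input_ids.ne(0)
+
+    packed = input_ids[attention_mask]                # tensor([5, 6, 7, 8, 9])
+    seqlens = attention_mask.sum(dim=-1)              # tensor([3, 2])
+    cu_seqlens = torch.nn.functional.pad(seqlens.cumsum(0), (1, 0))  # tensor([0, 3, 5])
+
+No compute is spent on pad positions, which is where the throughput gain comes from.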
+
+For other models, the transformers library may also support it, but we haven't tested it yet.
+Users can add the desired model config to the `test_transformer.py `_ file
+and test its functionality by running the following command:
+
+.. code-block:: bash
+
+ pytest -s tests/models/test_transformer.py
+
+If the test passes, you can add your desired model to the model `registry.py `_ file.
+Then, you can enjoy the performance boost of sequence packing,
+and you are welcome to PR your tested model to verl!
+
+
+Batch Size Tuning
+-----------------
+
+To achieve higher throughput in experience preparation (i.e., model fwd) and model update (i.e., actor/critic fwd/bwd),
+users may need to tune the ``*micro_batch_size_per_gpu`` for different computation.
+
+In verl, the core principle for setting batch sizes is:
+
+- **Algorithmic metrics** (train batch size, PPO mini-batch size) are *global* (from a single-controller perspective),
+ normalized in each worker. See the `normalization code `_.
+
+- **Performance-related parameters** (micro batch size, max token length for dynamic batch size) are *local* parameters that define the per-GPU data allocations.
+ See the `normalization code `_.
+
+.. note:: In your training script, please use ``*micro_batch_size_per_gpu`` instead of ``*micro_batch_size``.
+   This way you don't need to handle the normalization of ``micro_batch_size`` yourself; ``micro_batch_size`` will be deprecated.
+
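+For example, a *global* mini-batch is normalized across data-parallel ranks before any worker sees it. A minimal arithmetic sketch (illustrative numbers, not verl's exact code path):
+
+.. code-block:: python
+
+    ppo_mini_batch_size = 256   # global value, single-controller view
+    world_size = 8              # number of training GPUs (DP ranks)
+
+    mini_batch_per_gpu = ppo_mini_batch_size // world_size             # 32
+    micro_batch_size_per_gpu = 8                                       # local knob
+    grad_accum_steps = mini_batch_per_gpu // micro_batch_size_per_gpu  # 4
+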
+Batch Size Tuning tips
+""""""""""""""""""""""
+
+Users may need to tune the ``*micro_batch_size_per_gpu`` to accelerate training. Here are some tips:
+
+1. **Enable gradient checkpointing**:
+ Set ``actor_rollout_ref.model.enable_gradient_checkpointing=True`` and ``critic.model.enable_gradient_checkpointing=True``.
+ This often allows for larger micro-batch sizes and will be beneficial for large mini-batch training.
+
+2. Increase the ``*micro_batch_size_per_gpu`` as much as possible, until it equals the normalized ``mini_batch_size``.
+
+3. **Use larger forward-only parameters**:
+   Forward-only parameters, such as ``actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu``,
+   ``actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu``, and ``critic.forward_micro_batch_size_per_gpu``, can be larger (e.g., 2x) than training-related micro batch sizes,
+   such as ``actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu`` and ``critic.ppo_micro_batch_size_per_gpu``.
+
+4. **Allow larger micro-batch sizes for Critic and Reward models**:
+   The micro batch sizes of the Critic and Reward models can be larger than the Actor's, because the actor model has a much larger (vocabulary-sized) output in its final layer.
+
+5. **Enable activation offloading**:
+ Set ``actor_rollout_ref.model.enable_activation_offload=True`` and ``critic.model.enable_activation_offload=True``.
+ This often works together with gradient checkpointing to get larger micro-batch sizes and it's only available in FSDP backend now.
+
+Tuning for Dynamic Batch Size
+-----------------------------
+
+Dynamic batch size is a technique that allows the model to process a similar number of tokens in each forward pass (with varying actual batch sizes).
+This can significantly improve the training efficiency and reduce the memory usage.
+
+To utilize this technique, users can set ``use_dynamic_bsz=True`` in actor, ref, critic and reward models.
+With ``use_dynamic_bsz=True``, users don't need to tune ``*micro_batch_size_per_gpu``.
+Instead, users should tune the following parameters:
+
+- ``actor_rollout_ref.actor.ppo_max_token_len_per_gpu``, ``critic.ppo_max_token_len_per_gpu``:
+ The maximum number of tokens to be processed in fwd and bwd of ``update_policy`` and ``update_critic``.
+
+- ``actor_rollout_ref.ref.log_prob_max_token_len_per_gpu`` and ``actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu``:
+  The maximum number of tokens to be processed in the fwd computation of ``compute_log_prob`` and ``compute_ref_log_prob``.
+
+- ``critic.forward_micro_batch_size_per_gpu``, ``reward_model.forward_micro_batch_size_per_gpu``:
+  The maximum number of tokens to be processed in the fwd computation of ``compute_values`` and ``compute_rm_score``.
+
+Dynamic Batch Size Tuning tips
+""""""""""""""""""""""""""""""
+
+Here are some tips to tune the above parameters:
+
+1. **Increase** ``actor_rollout_ref.actor.ppo_max_token_len_per_gpu``
+ Make it at least 2 x (max_prompt_length + max_response_length). We set it to 3x in `run_qwen2-7b_rm_seq_balance.sh `_.
+ Try to increase it to get higher throughput.
+
+2. **Forward-only parameters can be larger**:
+ Similar to the non-dynamic-batch scenario, forward-only token limits can exceed those used in forward/backward operations.
+
+3. **Use larger limits for Critic and Reward models**:
+ Critic and Reward parameters can be set at least 2× the Actor’s limits. For instance, we set them to 4× here:
+ `run_qwen2-7b_rm_seq_balance.sh `_
+
+.. :math:`\text{critic.ppo_max_token_len_per_gpu} = 2 \times \text{actor.ppo_max_token_len_per_gpu}`
+
+Ulysses Sequence Parallel for Long Context Training
+----------------------------------------------------
+
+To utilize this technique, users can set ``ulysses_sequence_parallel_size>1`` in actor, ref, critic and reward models.
+
+We support different models using different ``ulysses_sequence_parallel_size`` values.
+
+To train on long sequences (>32k), users may need to decrease ``*micro_batch_size_per_gpu`` and ``*max_token_len_per_gpu`` to avoid OOM.
+
+LigerKernel for SFT
+----------------------
+
+LigerKernel is a high-performance kernel for Supervised Fine-Tuning (SFT) that can improve training efficiency. To enable LigerKernel in your SFT training:
+
+1. Install liger-kernel via ``pip3 install liger-kernel``. In your SFT configuration file (e.g., ``verl/trainer/config/sft_trainer.yaml``), set the ``use_liger`` parameter:
+
+ .. code-block:: yaml
+
+ model:
+ use_liger: True # Enable LigerKernel for SFT
+
+2. The default value is ``False``. Enable it only when you want to use LigerKernel's optimizations.
+
+3. LigerKernel is particularly useful for improving training performance in SFT scenarios.
+
+Forward prefetch in FSDP training backend
+------------------------------------------
+
+During the training phase, users can enable forward prefetching in FSDP by setting ``fsdp_config.forward_prefetch=True``. For example, ``actor_rollout_ref.actor.fsdp_config.forward_prefetch=True``. This configuration prefetches the next forward-pass all-gather operation before completing the current forward computation, overlapping communication with computation and improving efficiency. For further details, refer to the `FSDP forward_prefetch `_ documentation.
+
+.. note::
+ Backward prefetch is unsupported because the ``BACKWARD_POST`` policy may prefetch incorrectly in nested-module cases. For details, see the `FSDP documentation `_
+
+Migrating to FSDP2
+----------------------
+
+FSDP2 offers notable improvements over FSDP1. According to `PyTorch TorchTitan benchmarks `_:
+
+- 7% lower GPU memory usage on average
+- 1.5% throughput improvement with BF16 training
+- Better composability with DTensor and per-parameter sharding
+
+**Enabling FSDP2 in VERL:**
+
+ .. code-block:: python
+
+ # Enable FSDP2 in actor configuration
+ actor_rollout_ref.actor.strategy="fsdp2"
+
+.. note::
+ FSDP2 requires PyTorch 2.1+ and is recommended for models with transformer architecture.
+
+Memory optimization for entropy calculation from logits
+----------------------
+
+The ``logits`` tensor (typically of shape ``[bsz*seq_len, voc]``) can consume significant memory. When using ``compute_entropy_from_logits``, memory usage reaches approximately ``[bsz*seq_len, voc] × (4 bytes (float32) + 2 bytes (autocast for softmax+logsumexp) + 1 byte (softmax output))``.
+
+To reduce this memory peak, enable chunked computation by setting:
+``actor_rollout_ref.ref.entropy_from_logits_with_chunking = True``
+This processes the tensor in chunks of shape ``[chunk_size, voc]`` (e.g., 2048) rather than the full sequence length, exclusively during the model's forward pass.
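+
+A minimal sketch of the chunked computation (hand-rolled for illustration; verl's internal implementation may differ):
+
+.. code-block:: python
+
+    import torch
+
+    def entropy_from_logits_chunked(logits, chunk_size=2048):
+        """Per-token entropy over [N, vocab] logits, computed chunk by chunk
+        so that only [chunk_size, vocab] float32 intermediates are live."""
+        out = torch.empty(logits.shape[0], dtype=torch.float32, device=logits.device)
+        for start in range(0, logits.shape[0], chunk_size):
+            chunk = logits[start:start + chunk_size].float()
+            logp = chunk - torch.logsumexp(chunk, dim=-1, keepdim=True)
+            out[start:start + chunk_size] = -(logp.exp() * logp).sum(dim=-1)
+        return out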
+
+Additionally, during training, standard gradient checkpointing (``enable_gradient_checkpointing=True``) does not apply to entropy calculations. To reduce memory peaks in this context, set:
+``actor_rollout_ref.actor.entropy_checkpointing = True``
+This enables entropy recomputation specifically for the entropy calculation, lowering memory usage during training.
diff --git a/code/RL_model/verl/verl_train/docs/perf/torch_profiling.md b/code/RL_model/verl/verl_train/docs/perf/torch_profiling.md
new file mode 100644
index 0000000000000000000000000000000000000000..3c2b67ea84881e2a5249f5b8f435d0cf80747289
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/perf/torch_profiling.md
@@ -0,0 +1,117 @@
+# PyTorch Profiling in verl
+
+Last updated: 01/13/2026.
+
+This guide explains how to use the native [PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) for profiling verl training runs.
+
+## Configuration
+
+Profiling in verl can be configured through parameters in the trainer configuration file (e.g., `ppo_trainer.yaml`).
+
+### Global Profiling Control
+
+In `global_profiler`, you can control when and how profiling occurs globally:
+
+* **`global_profiler.steps`**: List of step numbers to profile. E.g., `[1, 2, 5]` profiles steps 1, 2, and 5. Set to `null` to disable.
+* **`global_profiler.save_path`**: Directory to save the profiling results. Default is `outputs/profile`.
+
+### Role Profiling Control
+
+Each RL role (Actor, Critic, etc.) has its own `profiler` configuration:
+
+* **`enable`**: Whether to enable profiling for this role.
+* **`all_ranks`**: If `True`, profiles all ranks.
+* **`ranks`**: List of specific ranks to profile if `all_ranks` is `False`.
+* **`tool_config.torch`**: Configuration specific to the PyTorch Profiler.
+
+#### PyTorch Profiler Options (`tool_config.torch`)
+
+You can customize the PyTorch Profiler behavior using the following fields under `tool_config.torch`:
+
+* **`contents`**: List of contents to profile.
+ * **`cpu`**: Profile CPU activities.
+ * **`cuda`**: Profile CUDA activities.
+ * **`memory`**: Track tensor memory allocation/free.
+ * **`shapes`**: Record shapes of operator inputs.
+ * **`stack`**: Record source code file and line number.
+* **`schedule`**: (Advanced) configuration for `wait`, `warmup`, `active`, `repeat` cycles.
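+
+For orientation, these fields mirror the options of the native `torch.profiler` API. A minimal stand-alone sketch (the `train_step()` call is a placeholder):
+
+```python
+import torch
+from torch.profiler import profile, schedule, ProfilerActivity
+
+# wait 1 step, warm up for 1 step, record 3 steps, repeat the cycle twice
+sched = schedule(wait=1, warmup=1, active=3, repeat=2)
+
+with profile(
+    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],  # `cpu`, `cuda`
+    schedule=sched,
+    record_shapes=True,    # corresponds to `shapes`
+    profile_memory=True,   # corresponds to `memory`
+    with_stack=True,       # corresponds to `stack`
+) as prof:
+    for step in range(10):
+        train_step()       # placeholder for one training step
+        prof.step()        # advance the profiler schedule
+```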
+
+## Examples
+
+### 1. End-to-End Collection
+
+Collects performance data for all steps in a single trace file.
+
+```yaml
+global_profiler:
+ steps: [1, 2, 5]
+ save_path: ./outputs/profile
+
+actor_rollout_ref:
+ actor:
+ profiler:
+ enable: True
+ all_ranks: True
+ tool_config:
+ torch:
+ discrete: False
+ contents: [cpu, cuda]
+ # rollout & ref follow actor settings
+```
+
+### 2. Discrete Mode Collection
+
+Discrete mode saves separate trace files for each step. This is useful for detailed analysis and is **mandatory** when using Agent Loop.
+
+**Configuration Example**
+
+This configuration supports profiling both Training (Actor) and Inference (Rollout). You can enable/disable them independently.
+
+```yaml
+actor_rollout_ref:
+ actor:
+ profiler:
+ enable: True # Set to True to profile training
+ all_ranks: False
+ ranks: [0] # Global Rank 0
+ tool_config:
+ torch:
+ discrete: True
+ contents: [cpu, cuda]
+ rollout:
+ profiler:
+ enable: True # Set to True to profile inference
+ all_ranks: False
+ ranks: [0] # In Agent Loop, this is the Replica Rank (e.g. 0-th instance)
+ tool_config:
+ torch:
+ discrete: True # REQUIRED
+ # ref follow actor settings
+```
+
+> **Note for Agent Loop Mode**:
+> When using Agent Loop, `ranks` in rollout config refers to the **Replica Rank** (instance index), not the global rank.
+
+**Inference Backend Setup (for Agent Loop)**
+
+* **vLLM Engine**:
+ * **Environment Variables Required**:
+ * `VLLM_TORCH_PROFILER_DIR`: **(Required)** Directory to save traces (e.g., `/mnt/traces`).
+ * `VLLM_TORCH_PROFILER_WITH_STACK`: `1` to enable stack tracing (default).
+ * `VLLM_TORCH_PROFILER_RECORD_SHAPES`: `1` to record shapes of operator inputs.
+ * `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY`: `1` to track tensor memory allocation/free.
+ * `VLLM_TORCH_PROFILER_WITH_FLOPS`: `1` to estimate FLOPS.
+ * *Note: vLLM ignores the `save_path` and `contents` in `ppo_trainer.yaml`.*
+
+* **SGLang Engine**:
+ * **Zero Configuration**: Automatically uses the settings from `ppo_trainer.yaml`.
+
+## Visualization
+
+Collected trace files (usually `.json` or `.json.gz`) are stored in the configured `save_path`.
+
+You can visualize them using:
+
+1. **Chrome Tracing**: Open `chrome://tracing` in a Chrome browser and load the JSON file.
+2. **Perfetto**: Open [ui.perfetto.dev](https://ui.perfetto.dev/) and load the file (recommended for large traces).
+3. **TensorBoard**: If using the TensorBoard plugin for PyTorch Profiler.
diff --git a/code/RL_model/verl/verl_train/docs/perf/verl_profiler_system.md b/code/RL_model/verl/verl_train/docs/perf/verl_profiler_system.md
new file mode 100644
index 0000000000000000000000000000000000000000..fc7ecc38eed92ca5e05274e23f40b6f1ce7033b0
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/perf/verl_profiler_system.md
@@ -0,0 +1,36 @@
+# verl Profiler System
+
+Last updated: 08/18/2025.
+
+## Architecture
+
+The verl profiler system is configured at two levels.
+
+There is a global profiler and tool configuration that sets common options at the single-controller level, deciding:
+
+- `tool`: which tool to use
+- `steps`: which steps to profile
+- `save_path`: results saving path
+
+When a tool needs to profile the behavior of each role, role-level configuration is also required:
+
+- `tool`: which tool to use
+- `enable`: whether enable profiling on this role
+- rank info: `all_ranks` and `ranks` to decide which ranks to profile or log
+
+Role-level tool configs control tool-specific behavior, such as the `discrete` mode of the nsys profiler.
+
+Every role has a profiler config, and by default, rollout/ref/reward models follow the Actor's behavior.
+
+## Adding a New Profiling Tool
+
+A newly added profiling tool should reuse the existing APIs as much as possible.
+
+1. Gate **whether the tool is used** on `tool == [new tool]`.
+2. Add the global and local tool configs to `ppo_trainer.yaml`/`ppo_megatron_trainer.yaml` and each `[role].yaml`, under `global_tool_config.[new tool]` and `tool_config.[new tool]`.
+3. Implement the tool config in `verl/utils/profiler/config.py`, inheriting the `BaseConfig` class.
+4. Implement the tool's initialization logic using the configuration in `global_profiler.global_tool_config.[new tool]`, as well as the result-saving logic (results can also be saved per role).
+5. For role function-level profiling, follow the nsys profiler pattern in `nvtx_profiler.py`: implement a profiler class that inherits `DistProfiler` and import the new profiler in `verl/utils/profiler/__init__.py`. A sketch of steps 3 and 5 is given below.
+6. Add unit tests and examples so that others can adopt the tool conveniently.
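+
+The sketch below is hypothetical: everything other than the `BaseConfig` and `DistProfiler` names is made up for illustration, and the real base-class interfaces in `verl/utils/profiler/` may differ.
+
+```python
+# Hypothetical sketch only; check the real interfaces in verl/utils/profiler/.
+from dataclasses import dataclass, field
+
+from verl.utils.profiler import DistProfiler          # assumed import path
+from verl.utils.profiler.config import BaseConfig     # per step 3 above
+
+
+@dataclass
+class MyToolConfig(BaseConfig):
+    # Step 3: tool-specific options, e.g. a discrete mode like the nsys profiler's.
+    discrete: bool = False
+    contents: list = field(default_factory=list)
+
+
+class MyToolProfiler(DistProfiler):
+    # Step 5: role-level profiler following the nvtx_profiler.py pattern.
+    def __init__(self, rank: int, config: MyToolConfig):
+        self.rank = rank
+        self.config = config
+
+    def start(self, **kwargs):
+        ...  # initialize the underlying tool here
+
+    def stop(self, **kwargs):
+        ...  # flush and save results to the configured save_path
+```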
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/docs/preparation/prepare_data.rst b/code/RL_model/verl/verl_train/docs/preparation/prepare_data.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c429e4b167967652a0c3fb52d9e0029f1b9899d4
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/preparation/prepare_data.rst
@@ -0,0 +1,128 @@
+Prepare Data for Post-Training
+========================================
+
+Last updated: 02/09/2025.
+
+Before starting the post-training job, we need to prepare the data for
+the policy training. The data should be stored in the parquet format.
+
+We provide several data preprocessing scripts for different datasets,
+including GSM8K, MATH, HellaSwag, and Full_hh_rlhf. To prepare other
+datasets, follow the steps below. A data preprocessing script can be divided
+into two parts:
+
+1. The first part is common across datasets: it loads the dataset with
+   huggingface's ``datasets`` package, preprocesses it with ``make_map_fn``,
+   and stores the result in parquet format.
+
+.. code:: python
+
+ import re
+ import os
+ import datasets
+
+ from verl.utils.hdfs_io import copy, makedirs
+ import argparse
+
+ # To extract the solution for each prompts in the dataset
+ # def extract_solution(solution_str):
+ # ...
+
+
+ if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--local_dir', default='/opt/tiger/gsm8k')
+ parser.add_argument('--hdfs_dir', default=None)
+
+ args = parser.parse_args()
+
+ num_few_shot = 5
+ data_source = 'openai/gsm8k'
+
+ dataset = datasets.load_dataset(data_source, 'main')
+
+ train_dataset = dataset['train']
+ test_dataset = dataset['test']
+
+ # Construct a `def make_map_fn(split)` for the corresponding datasets.
+ # ...
+
+ train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
+ test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)
+
+ local_dir = args.local_dir
+ hdfs_dir = args.hdfs_dir
+
+ train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
+ test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))
+
+        # Only copy to HDFS when a destination is given (hdfs_dir defaults to None)
+        if hdfs_dir is not None:
+            makedirs(hdfs_dir)
+            copy(src=local_dir, dst=hdfs_dir)
+
+2. Users are required to implement the ``make_map_fn()`` function (as well
+   as ``extract_solution``) on their own to support different datasets or
+   tasks.
+
+We have already implemented data preprocessing for the GSM8K, MATH, HellaSwag,
+and Full_hh_rlhf datasets. We take the GSM8K dataset as an example:
+
+**GSM8K**
+
+In the ``make_map_fn``, each data item should consist of the following
+five fields:
+
+1. ``data_source``: The name of the dataset, used to index the corresponding
+   reward function in the ``RewardModel``.
+2. ``prompt``: This field should be constructed in the format of
+   huggingface chat_template. The tokenizer in ``RLHFDataset`` will
+   apply the chat template and tokenize the prompt.
+3. ``ability``: Defines the task category.
+4. ``reward_model``: Currently, we only utilize the ``ground_truth``
+   field during evaluation. The ``ground_truth`` is computed by the
+   ``extract_solution`` function. **Note** that the implementation of
+   the corresponding reward function should align with this extracted
+   ``ground_truth``.
+5. ``extra_info``: Records some information about the current prompt. Not
+   used for now.
+
+.. code:: python
+
+ def extract_solution(solution_str):
+ solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str) # extract the solution after ####
+ assert solution is not None
+ final_solution = solution.group(0)
+ final_solution = final_solution.split('#### ')[1].replace(',', '')
+ return final_solution
+
+ instruction_following = "Let's think step by step and output the final answer after \"####\"."
+
+ # add a row to each data item that represents a unique id
+ def make_map_fn(split):
+
+ def process_fn(example, idx):
+ question = example.pop('question')
+
+ question = question + ' ' + instruction_following
+
+ answer = example.pop('answer')
+ solution = extract_solution(answer)
+ data = {
+ "data_source": data_source,
+ "prompt": [{
+ "role": "user",
+ "content": question
+ }],
+ "ability": "math",
+ "reward_model": {
+ "style": "rule",
+ "ground_truth": solution
+ },
+ "extra_info": {
+ 'split': split,
+ 'index': idx
+ }
+ }
+ return data
+
+ return process_fn
diff --git a/code/RL_model/verl/verl_train/docs/preparation/reward_function.rst b/code/RL_model/verl/verl_train/docs/preparation/reward_function.rst
new file mode 100644
index 0000000000000000000000000000000000000000..286e2aff49fea71e34ac706d509725cc94aece13
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/preparation/reward_function.rst
@@ -0,0 +1,71 @@
+Implement Reward Function for Dataset
+======================================
+
+Last updated: 06/02/2025.
+
+For each dataset, we need to implement a reward function or utilize a reward model to compute the rewards for the generated responses.
+We already pre-implemented some reward functions in `reward_score directory `_.
+You can also use customized reward functions.
+
+Currently, we support reward functions for the GSM8K and MATH datasets. For
+RLHF datasets (e.g., full_hh_rlhf) and code generation (e.g., APPS), we
+utilize a reward model and SandBox (to be open-sourced soon) for evaluation,
+respectively.
+
+RewardManager
+-------------
+
+In the entrypoint of the PPO Post-Training script `main_ppo.py `_,
+we implement a ``RewardManager`` that utilizes the pre-implemented reward functions to compute the scores for each response.
+
+In the ``RewardManager``, we implement a ``__call__`` function to
+compute the score for each response;
+all reward functions are executed through ``compute_score_fn``.
+The input is a ``DataProto``, which includes:
+
+- ``input_ids``, ``attention_mask``: ``input_ids`` and ``attention_mask`` after applying
+ chat_template, including prompt and response
+- ``responses``: response tokens
+- ``ground_truth``: The ground truth string of the current prompt.
+ Stored in ``non_tensor_batch`` in the ``DataProto``, which should be
+ preprocessed in the parquet files.
+- ``data_source``: The dataset name of the current prompt. Stored in
+ ``non_tensor_batch`` in the ``DataProto``, which should be
+ preprocessed in the parquet files.
+
+After detokenizing the responses, the response string and the ground-truth
+string are passed to ``compute_score_fn`` to compute the score for each
+response.
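+
+This loop can be sketched as follows (a simplified sketch only: the real ``RewardManager`` in ``main_ppo.py`` is richer, and the field access here just follows the ``DataProto`` description above):
+
+.. code:: python
+
+    # Sketch of the scoring loop described above; not the actual implementation.
+    class SimpleRewardManager:
+        def __init__(self, tokenizer, compute_score_fn):
+            self.tokenizer = tokenizer
+            self.compute_score_fn = compute_score_fn
+
+        def __call__(self, data):  # data: DataProto
+            scores = []
+            for i in range(len(data)):
+                response_str = self.tokenizer.decode(
+                    data.batch["responses"][i], skip_special_tokens=True
+                )
+                ground_truth = data.non_tensor_batch["ground_truth"][i]
+                data_source = data.non_tensor_batch["data_source"][i]
+                scores.append(self.compute_score_fn(data_source, response_str, ground_truth))
+            return scores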
+
+Reward Functions
+----------------
+
+Pre-implemented
+~~~~~~~~~~~~~~~
+
+We already pre-implemented some reward functions in `reward_score directory `_.
+
+- In the `GSM8k example `_, we
+  force the response to output the final answer after ``####``, then
+  use string matching to compare it with the ground truth: a fully correct
+  answer scores 1 point, a correct format scores 0.1 points, and an
+  incorrect format scores 0 points.
+- In the `MATH example `_, we follow
+ the implementation in `lm-evaluation-harness repository `_.
+
+Customized
+~~~~~~~~~~
+
+You can implement customized reward functions in a separate file and specify them using ``custom_reward_function.path`` and ``custom_reward_function.name``. For how to set these options, please refer to :ref:`config-explain-page`.
+
+The parameters of your reward function should be ``data_source``, ``solution_str``, ``ground_truth``, and ``extra_info``.
+For example:
+
+.. code:: python
+
+ def my_reward_fn(data_source, solution_str, ground_truth, extra_info=None):
+ return len(solution_str)/100
+
+If you are testing only a single customized reward function, you can simply name it ``compute_score`` and leave ``custom_reward_function.name`` unset.
+
+To run multiple tests with different customized reward functions, you can modify both ``custom_reward_function.path`` and ``custom_reward_function.name`` for each trial.
+For instance, you might create a single `my_reward.py` file and implement multiple reward functions within it. This way, for different trials, you only need to adjust ``custom_reward_function.name``, making it more convenient to conduct multiple tests within scripts, as sketched below.
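+
+A hypothetical ``my_reward.py`` with two selectable toy reward functions (names are illustrative):
+
+.. code:: python
+
+    # my_reward.py -- two toy reward functions; pick one per trial via
+    # custom_reward_function.name.
+
+    def length_reward(data_source, solution_str, ground_truth, extra_info=None):
+        # Longer answers score higher, capped at 1.0.
+        return min(len(solution_str) / 100, 1.0)
+
+    def exact_match_reward(data_source, solution_str, ground_truth, extra_info=None):
+        # 1.0 on exact string match with the ground truth, else 0.0.
+        return 1.0 if solution_str.strip() == str(ground_truth).strip() else 0.0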
diff --git a/code/RL_model/verl/verl_train/docs/requirements-docs.txt b/code/RL_model/verl/verl_train/docs/requirements-docs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..55ccdb8f7149bd6b774b592dca068e63e87256db
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/requirements-docs.txt
@@ -0,0 +1,13 @@
+# markdown support
+recommonmark
+myst_parser
+# markdown table support
+sphinx-markdown-tables
+
+# theme default rtd
+
+# crate-docs-theme
+sphinx-rtd-theme
+
+# pin tokenizers version to avoid env_logger version req
+tokenizers==0.21
diff --git a/code/RL_model/verl/verl_train/docs/sglang_multiturn/interaction_system.rst b/code/RL_model/verl/verl_train/docs/sglang_multiturn/interaction_system.rst
new file mode 100644
index 0000000000000000000000000000000000000000..812a9484eb264d79500bd0aba9607d43146bd01c
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/sglang_multiturn/interaction_system.rst
@@ -0,0 +1,417 @@
+Interaction System for Multi-turn RL Training
+=============================================
+
+Last updated: 06/25/2025.
+
+Overview
+--------
+
+The verl interaction system enables dynamic, multi-turn conversational feedback during reinforcement learning training. This system allows models to engage in iterative problem-solving scenarios where interaction agents can provide corrective feedback, guidance, or evaluation based on the model's responses.
+
+**New in Multi-Interaction Support**: The system now supports multiple named interactions within a single training session, enabling sophisticated training scenarios where different samples can use different interaction strategies. This allows for curriculum learning, domain-specific feedback, and flexible agent switching at the sample level.
+
+Key features:
+
+- **Async-based Architecture**: Non-blocking interaction processing for distributed training
+- **Instance Management**: Stateful session handling with unique instance IDs for concurrent interactions
+- **SGLang Integration**: Seamless integration with SGLang rollout system for multi-turn conversations
+- **Configuration-driven**: Dynamic agent loading via YAML configuration files
+- **Multi-Interaction Support**: Registry system enabling multiple named interactions per rollout
+- **Sample-Level Selection**: Each sample can specify which interaction to use via configuration
+- **Reward Integration**: Turn-level scoring mechanism integrated with verl's reward system
+
+Architecture
+------------
+
+The interaction system follows a plugin-based architecture with clear separation of concerns:
+
+.. code-block::
+
+ Interaction Registry System
+ ↓
+ BaseInteraction (Abstract Interface)
+ ↓
+ Multiple Named Interactions (e.g., Gsm8kInteraction, CustomInteraction)
+ ↓
+ SGLang Rollout Integration (interaction_map)
+ ↓
+ Sample-Level Interaction Selection
+ ↓
+ Async Request Lifecycle Management
+
+Core Components
+~~~~~~~~~~~~~~~
+
+**Interaction Registry System**
+
+The interaction registry system allows loading and managing multiple named interactions:
+
+.. code-block:: python
+
+ from verl.interactions.utils.interaction_registry import initialize_interactions_from_config
+
+ # Load multiple interactions from config
+ interaction_map = initialize_interactions_from_config("config.yaml")
+
+ # Access specific interaction by name
+ gsm8k_interaction = interaction_map["gsm8k"]
+ custom_interaction = interaction_map["custom_solver"]
+
+**BaseInteraction Interface**
+
+All interaction agents must implement the ``BaseInteraction`` abstract class:
+
+.. code-block:: python
+
+ from verl.interactions.base import BaseInteraction
+ from typing import Dict, Any, List, Tuple, Optional
+
+ class BaseInteraction:
+ def __init__(self, config: Dict[str, Any]):
+ self.config = config
+ self.name: str = config.get("name", "interaction_agent")
+
+ async def start_interaction(self, instance_id: Optional[str] = None, **kwargs) -> str:
+ """Initialize interaction session, return instance_id"""
+
+ async def generate_response(self, instance_id: str, messages: List[Dict[str, Any]], **kwargs) -> Tuple[bool, str, float, Dict[str, Any]]:
+ """Generate response, return (should_terminate, response, score, metadata)"""
+
+ async def calculate_score(self, instance_id: str, **kwargs) -> float:
+ """Calculate turn-level score for RL training"""
+
+ async def finalize_interaction(self, instance_id: str, **kwargs) -> None:
+ """Clean up resources"""
+
+**Request Lifecycle**
+
+The interaction system integrates with SGLang's async rollout via state management:
+
+1. ``PENDING`` → Initialize interaction via ``start_interaction()``
+2. ``GENERATING`` → Model generates response
+3. ``INTERACTING`` → Process response via ``generate_response()``
+4. ``GENERATING`` → Continue if not terminated, otherwise ``COMPLETED``
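+
+As an illustration, a rollout loop might drive this lifecycle roughly as follows (a simplified sketch; the actual SGLang integration handles batching, request states, and termination more carefully):
+
+.. code-block:: python
+
+    # Simplified sketch of the lifecycle above. `generate` stands in for the
+    # SGLang engine call; `interaction` is any BaseInteraction implementation.
+    async def run_episode(interaction, generate, messages, ground_truth, max_turns=10):
+        instance_id = await interaction.start_interaction(ground_truth=ground_truth)  # PENDING
+        score = 0.0
+        try:
+            for _ in range(max_turns):
+                reply = await generate(messages)                    # GENERATING
+                messages.append({"role": "assistant", "content": reply})
+                should_terminate, feedback, score, _ = await interaction.generate_response(
+                    instance_id, messages                           # INTERACTING
+                )
+                if should_terminate:                                # COMPLETED
+                    break
+                messages.append({"role": "user", "content": feedback})
+            return score
+        finally:
+            await interaction.finalize_interaction(instance_id)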
+
+Configuration
+-------------
+
+**Basic Setup**
+
+Enable interaction in your rollout configuration:
+
+.. code-block:: yaml
+
+ actor_rollout_ref:
+ rollout:
+ multi_turn:
+ enable: true
+ interaction_config_path: "path/to/interaction_config.yaml"
+ max_user_turns: 10
+ max_assistant_turns: 10
+
+**Interaction Configuration File**
+
+Create an interaction configuration file (e.g., ``interaction_config.yaml``):
+
+**Single Interaction (Legacy Format)**
+
+.. code-block:: yaml
+
+ interaction:
+ - name: "gsm8k"
+ class_name: "verl.interactions.gsm8k_interaction.Gsm8kInteraction"
+ config: {}
+
+**Multiple Interactions (New Format)**
+
+.. code-block:: yaml
+
+ interaction:
+ - name: "gsm8k"
+ class_name: "verl.interactions.gsm8k_interaction.Gsm8kInteraction"
+ config: {}
+ - name: "custom_solver"
+ class_name: "custom.interactions.CustomInteraction"
+ config:
+ solver_type: "advanced"
+ timeout: 30
+ - name: "code_verifier"
+ class_name: "verl.interactions.base.BaseInteraction"
+ config:
+ verification_mode: "strict"
+
+**Automatic Name Generation**
+
+If no ``name`` field is provided, the system will automatically generate one from the class name:
+
+.. code-block:: yaml
+
+ interaction:
+ - class_name: "verl.interactions.gsm8k_interaction.Gsm8kInteraction"
+ config: {}
+ # Automatically generates name: "gsm8k"
+
+The system will dynamically load all specified interaction classes and make them available by name.
+
+Implementation Example: GSM8K
+-----------------------------
+
+The GSM8K interaction demonstrates a complete implementation for math problem-solving scenarios:
+
+.. code-block:: python
+
+ from verl.interactions.base import BaseInteraction
+ from verl.utils.reward_score import gsm8k
+ from uuid import uuid4
+
+ class Gsm8kInteraction(BaseInteraction):
+ def __init__(self, config: dict):
+ super().__init__(config)
+ self._instance_dict = {}
+
+ async def start_interaction(self, instance_id=None, ground_truth=None, **kwargs):
+ if instance_id is None:
+ instance_id = str(uuid4())
+ self._instance_dict[instance_id] = {
+ "response": "",
+ "ground_truth": ground_truth,
+ "reward": 0.0,
+ }
+ return instance_id
+
+ async def generate_response(self, instance_id, messages, **kwargs):
+ # Extract last assistant message content
+ content = ""
+ for item in reversed(messages):
+ if item.get("role") == "assistant":
+ content = item.get("content", "")
+ break
+
+ # Ensure GSM8K format (#### prefix)
+ self._instance_dict[instance_id]["response"] = content
+
+ reward = await self.calculate_score(instance_id)
+ if reward == 1.0:
+ return True, "Your response is correct!", 1.0, {}
+ else:
+ return False, "Your response is incorrect! You need to reflect on your answer and try again.", 0.0, {}
+
+ async def calculate_score(self, instance_id, **kwargs):
+ return gsm8k.compute_score(
+ self._instance_dict[instance_id]["response"],
+ self._instance_dict[instance_id]["ground_truth"],
+ method="strict", format_score=0.0, score=1.0,
+ )
+
+ async def finalize_interaction(self, instance_id, **kwargs):
+ del self._instance_dict[instance_id]
+
+Training Integration
+--------------------
+
+**Training Script Configuration**
+
+Include interaction configuration in your training command:
+
+.. code-block:: bash
+
+ python3 -m verl.trainer.main_ppo \\
+ --config-path="$CONFIG_PATH" \\
+ --config-name='gsm8k_multiturn_grpo_w_interaction' \\
+ algorithm.adv_estimator=grpo \\
+ data.train_batch_size=512 \\
+ data.return_raw_chat=True \\
+ actor_rollout_ref.rollout.name=sglang \\
+ actor_rollout_ref.rollout.multi_turn.interaction_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/interaction_config/gsm8k_interaction_config.yaml" \\
+ trainer.total_epochs=15
+
+**Data Requirements**
+
+Ensure your dataset includes interaction parameters with the ``name`` field for interaction selection:
+
+.. code-block:: python
+
+ # Dataset should include interaction_kwargs in non_tensor_batch
+ interaction_kwargs = [
+ {"name": "gsm8k", "query": "What is 2+2?", "ground_truth": "4"},
+ {"name": "custom_solver", "query": "Solve: x^2 + 5x + 6 = 0", "ground_truth": "x = -2, -3"},
+ {"name": "gsm8k", "query": "What is 3+3?", "ground_truth": "6"},
+ ]
+
+**Sample-Level Interaction Selection**
+
+Each sample can specify which interaction to use via the ``name`` field. This enables flexible training scenarios where different samples use different interaction strategies:
+
+.. code-block:: python
+
+ # Example: Math problems use GSM8K interaction, code problems use code verifier
+ data_samples = [
+ {
+ "prompt": "What is 15% of 200?",
+ "interaction_kwargs": {
+ "name": "gsm8k",
+ "query": "What is 15% of 200?",
+ "ground_truth": "30"
+ }
+ },
+ {
+ "prompt": "Write a function to check if a number is prime",
+ "interaction_kwargs": {
+ "name": "code_verifier",
+ "code_type": "python",
+ "expected_behavior": "return True for prime numbers"
+ }
+ }
+ ]
+
+**Backward Compatibility**
+
+If no ``name`` field is provided in ``interaction_kwargs``, the system defaults to ``"gsm8k"`` for backward compatibility.
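+
+Conceptually, the rollout resolves each sample's interaction like this (a sketch of the dispatch logic, not the exact implementation):
+
+.. code-block:: python
+
+    # Sketch: per-sample interaction dispatch with the documented default.
+    def resolve_interaction(interaction_map, interaction_kwargs):
+        name = interaction_kwargs.get("name", "gsm8k")  # default for backward compatibility
+        if name not in interaction_map:
+            raise ValueError(f"Interaction '{name}' not found in interaction_map")
+        return interaction_map[name]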
+
+Best Practices
+--------------
+
+**Resource Management**
+
+- Always implement proper cleanup in ``finalize_interaction()``
+- Use unique instance IDs to avoid conflicts in concurrent training
+- Handle edge cases like empty messages or malformed content
+
+**Performance Optimization**
+
+- Keep interaction logic lightweight to avoid blocking training
+- Use async/await properly to maintain non-blocking behavior
+- Consider caching expensive computations within interaction instances
+
+**Testing**
+
+Comprehensive testing is essential for interaction systems:
+
+.. code-block:: python
+
+ import pytest
+ from unittest.mock import patch
+
+ @pytest.mark.asyncio
+ async def test_interaction_workflow():
+ interaction = YourInteraction({})
+
+ # Test complete workflow
+ instance_id = await interaction.start_interaction(ground_truth="expected_answer")
+
+ messages = [{"role": "user", "content": "user_content"}, {"role": "assistant", "content": "assistant_content"}]
+ should_terminate, response, reward, metadata = await interaction.generate_response(instance_id, messages)
+
+ assert should_terminate in [True, False]
+ assert isinstance(reward, float)
+
+ await interaction.finalize_interaction(instance_id)
+
+Advanced Usage
+--------------
+
+**Multi-Interaction Training Strategies**
+
+You can design sophisticated training scenarios using multiple interactions:
+
+.. code-block:: python
+
+ # Example: Progressive difficulty with different interaction agents
+ class MathTrainingPipeline:
+ def create_interaction_config(self):
+ return {
+ "interaction": [
+ {
+ "name": "basic_math",
+ "class_name": "verl.interactions.gsm8k_interaction.Gsm8kInteraction",
+ "config": {"difficulty": "easy"}
+ },
+ {
+ "name": "advanced_math",
+ "class_name": "custom.interactions.AdvancedMathInteraction",
+ "config": {"difficulty": "hard", "allow_hints": True}
+ },
+ {
+ "name": "competition_math",
+ "class_name": "custom.interactions.CompetitionMathInteraction",
+ "config": {"time_limit": 300, "show_steps": False}
+ }
+ ]
+ }
+
+ def create_curriculum_data(self, epoch):
+ if epoch < 5:
+ return [{"name": "basic_math", ...} for _ in samples]
+ elif epoch < 10:
+ return [{"name": "advanced_math", ...} for _ in samples]
+ else:
+ return [{"name": "competition_math", ...} for _ in samples]
+
+**Custom Scoring Functions**
+
+You can integrate custom reward functions:
+
+.. code-block:: python
+
+ async def calculate_score(self, instance_id, **kwargs):
+ response = self._instance_dict[instance_id]["response"]
+ ground_truth = self._instance_dict[instance_id]["ground_truth"]
+
+ # Custom evaluation logic
+ if custom_evaluation_function(response, ground_truth):
+ return 1.0
+ else:
+ return 0.0
+
+**Multi-step Interactions**
+
+For complex scenarios requiring multiple feedback rounds:
+
+.. code-block:: python
+
+ async def generate_response(self, instance_id, messages, **kwargs):
+ instance = self._instance_dict[instance_id]
+ instance["attempts"] += 1
+
+ # Evaluate current response
+ reward = await self.calculate_score(instance_id)
+
+ if reward > 0.8:
+ return True, "Excellent work!", reward, {}
+ elif instance["attempts"] < 3:
+ return False, "Good attempt, but try to improve...", reward, {}
+ else:
+ return True, "Maximum attempts reached.", reward, {}
+
+Troubleshooting
+---------------
+
+**Common Issues**
+
+1. **Instance ID Conflicts**: Ensure unique instance IDs across concurrent sessions
+2. **Memory Leaks**: Always call ``finalize_interaction()`` to clean up resources
+3. **Blocking Operations**: Keep interaction logic async and non-blocking
+4. **Configuration Errors**: Verify interaction config path and class name are correct
+5. **Interaction Name Conflicts**: Ensure all interactions have unique names in the configuration
+6. **Missing Interaction**: Verify the ``name`` field in ``interaction_kwargs`` matches available interactions
+7. **Backward Compatibility**: When migrating from single to multi-interaction, add ``name`` fields to existing data
+
+**Debugging**
+
+Enable debug logging to trace interaction flow:
+
+.. code-block:: bash
+
+ export VERL_LOGGING_LEVEL=DEBUG
+
+**Performance Monitoring**
+
+Monitor interaction performance impact on training throughput and adjust accordingly.
+
+Related Documentation
+--------------------
+
+- :doc:`multiturn`: Basic multi-turn rollout configuration
+- :doc:`sandbox_fusion`: Tool integration with SGLang
+- :doc:`search_tool_example`: Search tool implementation example
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/docs/sglang_multiturn/multiturn.rst b/code/RL_model/verl/verl_train/docs/sglang_multiturn/multiturn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..54548316d14155434c937fb8c292cd4dec471b0c
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/sglang_multiturn/multiturn.rst
@@ -0,0 +1,354 @@
+Multi-turn Rollout Support
+==========================
+
+Last updated: 06/27/2025.
+
+Basic Configuration
+~~~~~~~~~~~~~~~~~~~
+
+To enable multi-turn rollout, make sure to configure the following fields in your rollout configuration:
+
+.. code-block:: yaml
+
+    actor_rollout_ref:
+        rollout:
+            multi_turn:
+                enable: True
+            name: "sglang"
+
+This configuration activates the SGLang engine for multi-turn interaction during rollout.
+
+Custom Tool Configuration
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+For custom environment interaction tools, you can implement your own tools based on ``verl.tools.base_tool.BaseTool``. Then, specify your tool configurations in a YAML file:
+
+.. code-block:: yaml
+
+ tools:
+ - class_name: ""
+ config:
+ type: native
+ tool_schema:
+
+You may refer to GSM8KTool_example_configuration_, which is one example of a tool configuration. Its implementation can be found in gsm8k_tool.py_.
+
+Finally, set the ``tools_config_file`` in your rollout config:
+
+.. code-block:: yaml
+
+ actor_rollout_ref:
+ rollout:
+ tool_kwargs:
+ tools_config_file:
+
+This allows integration of customized tool behaviors during actor rollout steps.
+
+If you want rollouts with simulated interaction, you can set the ``interaction_config_file`` in your rollout config:
+
+.. code-block:: yaml
+
+ interaction:
+ - class_name: ""
+ config: {}
+
+.. code-block:: yaml
+
+ actor_rollout_ref:
+ rollout:
+ interaction_config_file:
+
+If your tool creates multi-modal inputs, you should return a list of multi-modal inputs in your ``tool.execute()`` implementation.
+
+Images and videos should be processed before returning. For example, if you are using Qwen2.5-VL, you can use the following code to get the representations:
+
+.. code-block:: python
+
+ async def create(self, ...) -> tuple[str, ToolResponse]:
+ ...
+ from verl.utils.dataset.vision_utils import process_image, process_video
+
+ img1 = process_image(img1)
+ video1 = process_video(video1)
+
+        # vLLM expects the singular keys "image" / "video" rather than "images" / "videos"
+        # when passing lists of images or videos, so we use image=[...] / video=[...] here
+ # link: https://github.com/vllm-project/vllm/blob/3c545c0c3b98ee642373a308197d750d0e449403/vllm/multimodal/parse.py#L205
+ return instance_id, ToolResponse(image=[img1, ...], video=[video1, ...], text="...")
+
+ async def execute(self, ...) -> Tuple[str | Dict[str, Any], float, dict]:
+ ...
+ from verl.utils.dataset.vision_utils import process_image, process_video
+
+ img1 = process_image(img1)
+ video1 = process_video(video1)
+
+        # vLLM expects the singular keys "image" / "video" rather than "images" / "videos"
+        # when passing lists of images or videos, so we use image=[...] / video=[...] here
+ # link: https://github.com/vllm-project/vllm/blob/3c545c0c3b98ee642373a308197d750d0e449403/vllm/multimodal/parse.py#L205
+ return ToolResponse(image=[img1, ...], video=[video1, ...], text="..."), 0, {}
+
+Remember to set ``return_multi_modal_inputs: False`` in your dataset config so that multi-modal inputs are processed correctly during rollout.
+Refer to the `Handling Multi-Modal Inputs in Datasets`_ section for more details.
+
+MCP Tool Configuration
+~~~~~~~~~~~~~~~~~~~~~~
+
+For MCP interaction tools, you can flexibly configure them using a YAML file. The typical setup is as follows:
+
+.. code-block:: yaml
+
+ tools:
+ - class_name: ""
+ config:
+ type: mcp
+ mcp:
+ mcp_servers_config_path: ./mcp_server.json
+ tool_selected_list: {}
+
+The ``tool_selected_list`` field is optional and specifies which tools to use from the servers. If you want to enable all available tools, simply omit this attribute. Besides, ``mcp_servers_config_path`` points to a JSON file containing the MCP server configurations. For example:
+
+.. code-block:: json
+
+ {
+ "mcpServers": {
+ "SSE Server": {
+ "url": "your_server_url",
+ "auth_token": "your_server_api_token"
+ },
+ "STDIO Server": {
+ "command": "npx",
+ "args": ["-y", "server-mcp@0.2.1"],
+ "env": {
+ "SERVER_API_KEY": "your_server_api_token"
+ }
+ }
+ }
+ }
+
+Since the content formats returned by the MCP server may vary, users can inherit from ``MCPBaseTool`` and override the ``_parse_tool_result`` method to implement custom parsing logic.
+
+.. code-block:: python
+
+ class MCPYourTool(MCPBaseTool):
+ def __init__(self, config: dict, tool_schema: OpenAIFunctionToolSchema):
+ super().__init__(config, tool_schema)
+
+ def _parse_tool_result(self, content: list) -> Tuple[str, dict]:
+ ...
+
+Overall, you may refer to mcp_search_tool.py_ and mcp_tool_config.yaml_ for custom implementation and configuration.
+
+Multi-turn Tokenization
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Tokenizing multi-turn rollouts poses a challenge: after applying the chat template and tokenizing the full message list, it's hard to identify which tokens belong to assistant messages. Since the token list is flat, it lacks direct alignment with the message roles.
+
+To address this, we adopt a **delta-based tokenization** strategy. Each time the LLM generates a new message, we:
+
+1. Apply the chat template to all prior messages (`messages[:i]`).
+2. Apply the chat template again including the latest message (`messages[:i+1]`).
+3. Tokenize only the *delta* between these two serialized message strings.
+
+This ensures that only tokens generated by the assistant are included in the loss mask.
+
+.. code-block:: python
+
+ # When using tokenizer
+ # Exclude the assistant prompt (e.g., "<|im_start|>assistant") from the loss by setting add_generation_prompt=True
+ prev = tokenizer.apply_chat_template(messages[:i], add_generation_prompt=True, tokenize=False)
+ curr = tokenizer.apply_chat_template(messages[:i+1], add_generation_prompt=False, tokenize=False)
+    new_token_ids = tokenizer.encode(curr[len(prev):], add_special_tokens=False)
+    token_ids += new_token_ids
+    loss_mask += [1] * len(new_token_ids)  # Mask only the new assistant tokens
+
+.. code-block:: python
+
+ # When using processor
+ # Exclude the assistant prompt (e.g., "<|im_start|>assistant") from the loss by setting add_generation_prompt=True
+ prev = processor.apply_chat_template(messages[:i], add_generation_prompt=True, tokenize=False)
+    prev_input_ids = processor(text=prev, images=images, videos=videos, return_tensors="pt")["input_ids"][0].tolist()
+    curr = processor.apply_chat_template(messages[:i+1], add_generation_prompt=False, tokenize=False)
+    curr_input_ids = processor(text=curr, images=images, videos=videos, return_tensors="pt")["input_ids"][0].tolist()
+    new_token_ids = curr_input_ids[len(prev_input_ids):]
+    token_ids += new_token_ids
+    loss_mask += [1] * len(new_token_ids)  # Mask only the new assistant tokens
+
+While we've validated that this produces results consistent with full-message tokenization, future models' chat templates could break compatibility. To guard against silent inconsistencies, we compare the delta-based tokenization with the full-tokenization result by default at the end of each rollout.
+
+If you see the following warning, you can check the mismatched substring in the log:
+
+.. code-block::
+
+ Inconsistent training and inference tokenization detected. This may lead to unexpected behavior during training. Please review your chat template to determine if this is intentional. For more information, refer to the multiturn README.md.
+
+The tokenization sanity check mode can be configured using the ``actor_rollout_ref.rollout.multi_turn.tokenization_sanity_check_mode`` parameter, which accepts the following values:
+
+- ``strict`` (default): Performs strict comparison between delta-based and full tokenization results, raising warnings for any differences.
+
+- ``ignore_strippable``: Ignores differences in whitespace characters (``\n``, ``\t``, ``\r``, spaces) while still checking for meaningful text mismatches. This is useful when debugging chat template issues where whitespace variations are expected and acceptable.
+
+- ``disable``: Completely disables the tokenization sanity check. Only use this if you have thoroughly validated that tokenization discrepancies are expected and won't impact training.
+
+Example configuration:
+
+.. code-block:: yaml
+
+ actor_rollout_ref:
+ rollout:
+ multi_turn:
+ tokenization_sanity_check_mode: "ignore_strippable" # Choose from: "disable", "ignore_strippable", "strict"
+
+Handling Multi-Modal Inputs in Datasets
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If your dataset includes multi-modal inputs (such as images or videos), you can control whether these are pre-processed and included in each sample by setting the ``return_multi_modal_inputs`` flag in your dataset config (used by ``RLHFDataset``).
+
+- ``return_multi_modal_inputs: True`` (default): The dataset will pre-process and include a ``multi_modal_inputs`` dictionary for each sample. This dict contains the model-ready representations (e.g., image tensors, video tensors, etc.) as produced by your processor. This is useful for single-turn or SFT-style training, where the model expects all modalities to be present in the batch.
+
+- ``return_multi_modal_inputs: False``: The dataset will not include the ``multi_modal_inputs`` field. This is recommended for multi-turn RL or tool-augmented rollouts, where the model may generate new multi-modal inputs dynamically during rollout, and you want to avoid conflicts or redundant data in the batch.
+
+
+Special Cases
+^^^^^^^^^^^^^
+
+Some models (e.g., Qwen/QwQ-32B and Qwen3 series) remove internal reasoning content during chat template rendering. As a result, the message content can vary across turns, making the delta-based tokenization inaccurate.
+
+For example, for the following conversation:
+
+.. code-block:: python
+
+ messages = [
+ {"role": "system", "content": "You are a helpful assistant."},
+ {"role": "user", "content": "What is 2 + 2?"},
+ {"role": "assistant", "content": "user asked about a simple math question. 2 + 2 = 4."},
+ {"role": "user", "content": "Explain why."},
+ {"role": "assistant", "content": "user wants to know the reasoning behind the answer. Search for a good explanation",
+ "tool_calls": [{"id": "tool1", "type": "search", "arguments": {"query": "Why is 2 + 2 = 4?"}}]},
+ {"role": "tool", "content": "The sum of two and two is four because it is a basic arithmetic operation."},
+ {"role": "assistant", "content": "The tool provided a good explanation.The sum of two and two is four because it is a basic arithmetic operation."}
+ ]
+
+1. Qwen/QwQ-32B will remove all reasoning content except the last assistant message after applying the chat template.
+
+.. code-block:: text
+
+ <|im_start|>system
+ You are a helpful assistant.<|im_end|>
+ <|im_start|>user
+ What is 2 + 2?<|im_end|>
+ <|im_start|>assistant
+ 2 + 2 = 4.<|im_end|>
+ <|im_start|>user
+ Explain why.<|im_end|>
+ <|im_start|>assistant
+
+ {"name": "", "arguments": {"query": "Why is 2 + 2 = 4?"}}
+ <|im_end|>
+ <|im_start|>user
+
+ The sum of two and two is four because it is a basic arithmetic operation.
+ <|im_end|>
+ <|im_start|>assistant
+ The tool provided a good explanation. The sum of two and two is four because it is a basic arithmetic operation.<|im_end|>
+
+2. Qwen3 series will remove all reasoning content before the last user message.
+
+.. code-block:: text
+
+ <|im_start|>system
+ You are a helpful assistant.<|im_end|>
+ <|im_start|>user
+ What is 2 + 2?<|im_end|>
+ <|im_start|>assistant
+ 2 + 2 = 4.<|im_end|>
+ <|im_start|>user
+ Explain why.<|im_end|>
+ <|im_start|>assistant
+
+ user wants to know the reasoning behind the answer. Search for a good explanation
+
+
+
+ {"name": "", "arguments": {"query": "Why is 2 + 2 = 4?"}}
+ <|im_end|>
+ <|im_start|>user
+
+ The sum of two and two is four because it is a basic arithmetic operation.
+ <|im_end|>
+ <|im_start|>assistant
+
+ The tool provided a good explanation.
+
+
+ The sum of two and two is four because it is a basic arithmetic operation.<|im_end|>
+
+To handle this, we fall back to a **fixed base conversation** containing only a single system and user message. Since this base doesn't include assistant messages or reasoning content, it remains consistent across turns.
+
+.. code-block:: python
+
+ BASE_CHAT_HISTORY = [
+ {"role": "system", "content": "You are a helpful assistant."},
+ {"role": "user", "content": "I am a user."}
+ ]
+ prev = tokenizer.apply_chat_template(BASE_CHAT_HISTORY, add_generation_prompt=True, tokenize=False)
+ curr = tokenizer.apply_chat_template([*BASE_CHAT_HISTORY, messages[i]], add_generation_prompt=False, tokenize=False)
+    new_token_ids = tokenizer.encode(curr[len(prev):], add_special_tokens=False)
+    token_ids += new_token_ids
+    loss_mask += [1] * len(new_token_ids)
+
+This method works well for Qwen3 series. However, Qwen/QwQ-32B currently has a bug in its chat template. A fix_ has been proposed but not yet adopted. Until then, use the following command to download the fixed model revision:
+
+.. code-block:: bash
+
+ pip install huggingface_hub
+ hf download Qwen/QwQ-32B --revision refs/pr/81
+
+.. _fix: https://huggingface.co/Qwen/QwQ-32B/discussions/81
+
+Discrepancy Between Training and Inference Templates
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Although the above approach fixes the delta mismatch issue, the removal of reasoning content in the inference-time chat template introduces a new discrepancy: training uses the full reasoning content, while inference does not.
+
+This mismatch can affect model performance in unpredictable ways. To avoid it, we default to using the full response (including reasoning) for both training and rollout.
+
+However, this approach comes with trade-offs:
+
+1. Long reasoning contents can easily exceed the model's context window, especially in multi-turn rollout.
+2. There's a mismatch between rollout and the production environment: models will not have reasoning content from past turns if you use the default chat template in production.
+
+We are still evaluating the impact of these issues. If you experience context length problems or prefer rollouts that match production (i.e., exclude reasoning), you can enable:
+
+``actor_rollout_ref.rollout.multi_turn.use_inference_chat_template = True``
+
+GSM8K Multi-turn Training Performance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+See the training performance of multi-turn rollout on the GSM8K task HERE_.
+
+.. _HERE: https://wandb.ai/zhaochenyang20/gsm8k_async_rl/runs/1ro1r7om?nw=nwuserzhaochenyang20
+
+.. _GSM8KTool_example_configuration: https://github.com/volcengine/verl/blob/main/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml
+
+.. _gsm8k_tool.py: https://github.com/volcengine/verl/blob/main/verl/tools/gsm8k_tool.py
+
+.. _mcp_search_tool.py: https://github.com/volcengine/verl/blob/main/verl/tools/mcp_search_tool.py
+
+.. _mcp_tool_config.yaml: https://github.com/volcengine/verl/blob/main/examples/sglang_multiturn/config/tool_config/mcp_tool_config.yaml
+
+Interaction System
+~~~~~~~~~~~~~~~~~~
+
+For dynamic conversational feedback during RL training, see:
+
+.. toctree::
+ :maxdepth: 1
+
+ interaction_system
+
+Search Tool Integration
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. toctree::
+ :maxdepth: 1
+
+ search_tool_example
+
+Code Walkthrough
+~~~~~~~~~~~~~~~~~~~~~~~
+If you want to learn more in depth about the code execution flow, please read https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/tree/main/rlhf/verl/multi-turn/code-walk-through
diff --git a/code/RL_model/verl/verl_train/docs/sglang_multiturn/sandbox_fusion.rst b/code/RL_model/verl/verl_train/docs/sglang_multiturn/sandbox_fusion.rst
new file mode 100644
index 0000000000000000000000000000000000000000..94adb8a356cbe98309b9287b7b255767c2bcd860
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/sglang_multiturn/sandbox_fusion.rst
@@ -0,0 +1,304 @@
+===============================
+Sandbox Fusion Tool Integration
+===============================
+
+Last updated: 06/10/2025.
+
+Motivations
+===========
+
+- As users of verl, we want to allow the model to call certain tools during Actor rollout, incorporating the results into the training process.
+- A colleague from ByteDance published a paper aimed at enhancing model capability through code-execution tools.
+- We aim to support tool-calling capabilities of inference engines using `sandbox-fusion` as the code execution system, providing the community with a reimplementation of `retools`.
+
+Reward Compute with Sandbox Fusion + FaaS Integration
+=====================================================
+
+- In current datasets and tasks, similar work already exists (e.g., Prime), which uses local processes as runners to execute model-generated code for reward computation.
+- On this basis, #1429 has advanced the design by integrating FaaS as the runner for reward computation.
+
+Goals
+=====
+
+- Adapt to the `sglang` tool-calling protocol and define tools for sandbox fusion.
+- Integrate with the `async-rollout` process, ensuring sandbox fusion tools follow asyncIO conventions.
+- Design and implement a basic rate limiter to prevent issues such as 429 errors.
+
+Non-Goals
+=========
+
+- Training effectiveness is out of scope.
+- Observability metrics are not considered.
+- Distributed failover and component fault tolerance are not addressed.
+
+Design Details
+==============
+
+Tool Schema Definition
+----------------------
+
+- Currently, only code execution is considered, requiring a `code` field in the JSON from the model.
+- Only Python code is supported for now, so no `language` parameter is defined.
+
+.. code-block:: python
+
+ OpenAIFunctionToolSchema(
+ type="function",
+ function=OpenAIFunctionSchema(
+ name="code_interpreter",
+ description="A tool for executing code.",
+ parameters=OpenAIFunctionParametersSchema(
+ type="object",
+ properties={
+ "code": OpenAIFunctionPropertySchema(
+ type="string",
+ description="The code to execute.",
+ enum=None,
+ )
+ },
+ required=["code"],
+ ),
+ strict=False,
+ )
+ )
+
+Configuration Parameters
+--------------------------
+
++----------------------------+--------------------------------------------------------------+
+| Parameter Name | Description |
++============================+==============================================================+
+| `num_workers` | Number of worker threads/processes per DP to request runner. |
++----------------------------+--------------------------------------------------------------+
+| `rate_limit` | Global limit of concurrent code executions. Default: 10 |
++----------------------------+--------------------------------------------------------------+
+| `default_timeout` | Timeout (in seconds) for each code execution. Default: 30 |
++----------------------------+--------------------------------------------------------------+
+| `default_language` | Default programming language. Default: "python" |
++----------------------------+--------------------------------------------------------------+
+| `enable_global_rate_limit` | Whether to enable global rate limiting. Default: True |
++----------------------------+--------------------------------------------------------------+
+| `sandbox_fusion_url` | URL for the veFaas sandbox execution service |
++----------------------------+--------------------------------------------------------------+
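+
+For illustration, these parameters might be gathered into a single tool config along the following lines (a Python-dict sketch; the exact schema and the endpoint URL are assumptions):
+
+.. code-block:: python
+
+    # Sketch: the configuration parameters above in one place.
+    sandbox_tool_config = {
+        "num_workers": 4,                  # worker threads per DP to request the runner
+        "rate_limit": 10,                  # global cap on concurrent code executions
+        "default_timeout": 30,             # seconds per code execution
+        "default_language": "python",
+        "enable_global_rate_limit": True,
+        "sandbox_fusion_url": "https://<your-vefaas-endpoint>/run_code",  # placeholder
+    }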
+
+Rate Limiting Design
+-----------------------
+
+Objective:
+
+- Limit the number of inflight requests using a token bucket model.
+
+- Ensure ordered submission to code runners to avoid starvation due to backoff.
+
+Design Highlights:
+
+- Use Ray Global Actor as a singleton distributed counter at cluster level.
+
+- Semaphore used for counting, with `acquire` and `release` in separate thread pools to preserve order.
+
+- Use Ray’s cloud-pickle to serialize functions for decoupled `ExecutionWorker`.
+
+.. code-block:: python
+
+    import logging
+    import threading
+    from contextlib import ExitStack
+    from enum import Enum
+    from typing import Callable, TypeVar
+
+    import ray
+
+    T = TypeVar("T")
+    logger = logging.getLogger(__name__)
+
+    class PoolMode(Enum):
+        ThreadMode = 1
+        ProcessMode = 2
+
+    @ray.remote(concurrency_groups={"acquire": 1, "release": 10})
+    class TokenBucketWorker:
+        def __init__(self, rate_limit: int):
+            self.rate_limit = rate_limit
+            self.current_count = 0
+            self._semaphore = threading.Semaphore(rate_limit)
+
+        @ray.method(concurrency_group="acquire")
+        def acquire(self):
+            self._semaphore.acquire()
+            self.current_count += 1
+
+        @ray.method(concurrency_group="release")
+        def release(self):
+            self._semaphore.release()
+            self.current_count -= 1
+
+        def get_current_count(self):
+            return self.current_count
+
+    class ExecutionWorker:
+        def __init__(self, enable_global_rate_limit=True, rate_limit=10):
+            self.rate_limit_worker = self._init_rate_limit(rate_limit) if enable_global_rate_limit else None
+
+        def _init_rate_limit(self, rate_limit):
+            # Singleton rate limiter shared across the cluster (get_if_exists).
+            return TokenBucketWorker.options(name="rate-limiter", get_if_exists=True).remote(rate_limit)
+
+        def execute(self, fn: Callable[..., T], *fn_args, **fn_kwargs) -> T:
+            with ExitStack() as stack:
+                if self.rate_limit_worker is not None:
+                    # Release the token when this scope exits, even on error.
+                    stack.callback(self.rate_limit_worker.release.remote)
+                    ray.get(self.rate_limit_worker.acquire.remote())
+                try:
+                    return fn(*fn_args, **fn_kwargs)
+                except Exception as e:
+                    logger.warning(f"Error when executing code: {e}")
+
+    def init_execution_pool(num_workers: int, enable_global_rate_limit=True, rate_limit=10, mode: PoolMode = PoolMode.ThreadMode):
+        if mode == PoolMode.ThreadMode:
+            return ray.remote(ExecutionWorker).options(max_concurrency=num_workers).remote(
+                enable_global_rate_limit=enable_global_rate_limit,
+                rate_limit=rate_limit
+            )
+        raise NotImplementedError("Process mode is not implemented yet")
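+
+For illustration, the pool might be used like this (a sketch; it assumes a running Ray cluster and uses a stand-in function in place of the real sandbox call):
+
+.. code-block:: python
+
+    import ray
+
+    ray.init()
+
+    pool = init_execution_pool(num_workers=4, enable_global_rate_limit=True, rate_limit=10)
+
+    def run_code(code: str) -> str:
+        # Stand-in for the real sandbox-fusion HTTP call.
+        return f"executed: {code!r}"
+
+    # At most `rate_limit` executions are in flight cluster-wide.
+    result = ray.get(pool.execute.remote(run_code, "print(1 + 1)"))
+    print(result)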
+
+Tool Implementation
+-------------------
+
+- Use `instance_id` to identify requests across multiple dialogue rounds.
+
+- Use `execution_pool` to implement async invocation.
+
+- Cleanup state after rollout completion.
+
+.. code-block:: python
+
+ class SandboxFusionTool(BaseTool):
+ def __init__(self, config: dict, tool_schema: OpenAIFunctionToolSchema):
+ ...
+ self.execution_pool = init_execution_pool(...)
+ ...
+
+ async def create(self, instance_id: Optional[str] = None, ...):
+ ...
+
+ async def execute(self, instance_id: str, parameters: dict[str, Any], **kwargs) -> Tuple[str, float, dict]:
+ code = parameters.get("code", "")
+ timeout = parameters.get("timeout", self.default_timeout)
+ language = parameters.get("language", self.default_language)
+ if not isinstance(code, str):
+ code = str(code)
+
+            result = await self.execution_pool.execute.remote(self.execute_code, instance_id, code, timeout, language)
+            self._instance_dict[instance_id]["reward"].append(result.strip())
+
+            return result, result, {}
+
+        def execute_code(self, instance_id, code, timeout=30, language="python"):
+            result_status, metadata = _process_single_case(0, None, None, self.sandbox_fusion_url, code, timeout, language)
+            # We always expect a finished run here, since there is no ground-truth answer to validate against.
+            if metadata["run_status"] == "Finished":
+                actual_output = metadata["stdout"] if metadata["stdout"] is not None else ""
+                return actual_output
+            else:
+                return "no stdout here"
+
+ async def calc_reward(self, instance_id: str, ...):
+ ...
+
+ async def release(self, instance_id: str, ...):
+ ...
+
+Test Plan
+=========
+
+Unit Tests
+----------
+
+- **test_tools_registration**: Test tool registration and initialization.
+- **test_rollout_req_creation**: Validate that `AsyncRolloutReq` is built correctly.
+- **test_over_size_case**: Ensure rollout terminates early when exceeding `max_seq_len`.
+- **test_tool_call_basic_case**: Mock `sglang` output, validate tool call and result.
+- **test_tool_call_batch_case**: Test batch processing of tool calls.
+- **test_basic_multi_process_init**: Validate Ray global actor behaves as singleton.
+- **TestSingleNodeRateLimiterCase**: Verify rate limiter works in single-node mode.
+- **test_rotten_execution**: Ensure rate limiter recovers from function errors.
+- **TestMultiNodeRateLimiterCase**: Verify behavior in multi-node environments.
+
+e2e Tests
+----------
+We provide an e2e test script in the `tests/special_e2e` folder: `tests/special_e2e/run_gsm8k_fsdp_sgl_multiturn_sf_tool.sh`.
+
+By setting ``trainer.rollout_data_dir`` you can dump the rollout data to local disk. Here is a sample taken from the rollout data:
+
+.. code-block:: python
+
+ {
+ "input": "
+
+ system\nYou are a math expert. You are given a question and you need to solve it step by step. Reasoning step by step before any tool call. You should use the `calc_gsm8k_reward` tool after step by step solving the question, before generate final answer at least once and refine your answer if necessary. Put your final answer in the format of `#### `.\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n\n{\"type\": \"function\", \"function\": {\"name\": \"code_interpreter\", \"description\": \"A tool for executing code.\", \"parameters\": {\"type\": \"object\", \"properties\": {\"code\": {\"type\": \"string\", \"description\": \"The code to execute.\", \"enum\": null}}, \"required\": [\"code\"]}, \"strict\": false}}\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n\n
+
+ user\nJohn gets a bonus that's the same percentage every year. Last year he made $100,000 and got a $10,000 bonus. This year he makes $200,000. How much will John make this year when adding both his total pay and bonus together? Let's think step by step and output the final answer after `####`.\n
+
+ assistant\n",
+ "output": "assistant\n\nOkay, let's see. John gets a bonus that's the same percentage every year. Last year, he made $100,000 and got a $10,000 bonus. This year, he makes $200,000. The question is asking how much he'll make this year when adding both his total pay and bonus together.\n\nFirst, I need to figure out the percentage bonus he gets. Since the bonus is the same percentage each year, if I can find that percentage based on last year's details, I can apply it to this year's earnings to find this year's bonus. Then, I'll add that bonus to his total pay this year.\n\nLast year, he earned $100,000 and got a $10,000 bonus. To find the percentage, I can divide the bonus by his total earnings. So, 10,000 divided by 100,000. Let me calculate that.\n\n10,000 / 100,000 equals 0.1, which is 10% as a percentage. So, his bonus is 10% of his pay each year.\n\nThis year, he makes $200,000. So, the bonus this year would be 10% of 200,000. Let me compute that using the code interpreter to ensure accuracy.\n
+
+ \n{\"name\": \"code_interpreter\", \"arguments\": {\"code\": \"total_pay_this_year = 200000\\nbonus_percentage = 10 / 100\\nbonus_this_year = total_pay_this_year * bonus_percentage\\ntotal_income_this_year = total_pay_this_year + bonus_this_year\\nprint(total_income_this_year)\", \"executes\": \"True\"}}\n\n
+
+ tool\n220000.0\n\n
+
+ assistant\n\nSo the code shows that John's total income this year would be $220,000. This is consistent with the 10% bonus on last year's pay of $100,000, which was $10,000. Therefore, this year's bonus of $20,000 added to his total pay of $200,000 results in a total income of $220,000.\n\nI need to verify that there are no other factors at play. The problem states it's a constant percentage bonus each year, so there's no mention of changing percentages or other variables. Therefore, applying a straightforward percentage of 10% on this year's partial salary amount would result in $20,000. Adding that to his total pay gives $220,000. The calculation is correct and aligns with the question's parameters.\n\n\n#### 220000.0",
+ "score": 0,
+ "step": 1
+ }
+
+Here is the readable version:
+
+.. code-block:: text
+
+ [system]
+
+ You are a math expert. You are given a question and you need to solve it step by step. Reasoning step by step before any tool call. You should use the `calc_gsm8k_reward` tool after step by step solving the question, before generate final answer at least once and refine your answer if necessary. Put your final answer in the format of `#### `.
+
+ # Tools
+
+ You may call one or more functions to assist with the user query.
+
+ You are provided with function signatures within XML tags:
+
+ {"type": "function", "function": {"name": "code_interpreter", "description": "A tool for executing code.", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": "The code to execute.", "enum": null}}, "required": ["code"]}, "strict": false}}
+
+
+ For each function call, return a json object with function name and arguments within XML tags:
+
+ {"name": , "arguments": }
+
+
+ [user]
+
+ John gets a bonus that's the same percentage every year. Last year he made $100,000 and got a $10,000 bonus. This year he makes $200,000. How much will John make this year when adding both his total pay and bonus together? Let's think step by step and output the final answer after `####`.
+
+ [assistant]
+
+
+ Okay, let's see. John gets a bonus that's the same percentage every year. Last year, he made $100,000 and got a $10,000 bonus. This year, he makes $200,000. The question is asking how much he'll make this year when adding both his total pay and bonus together.
+
+ First, I need to figure out the percentage bonus he gets. Since the bonus is the same percentage each year, if I can find that percentage based on last year's details, I can apply it to this year's earnings to find this year's bonus. Then, I'll add that bonus to his total pay this year.
+
+ Last year, he earned $100,000 and got a $10,000 bonus. To find the percentage, I can divide the bonus by his total earnings. So, 10,000 divided by 100,000. Let me calculate that.
+
+ 10,000 / 100,000 equals 0.1, which is 10% as a percentage. So, his bonus is 10% of his pay each year.
+
+ This year, he makes $200,000. So, the bonus this year would be 10% of 200,000. Let me compute that using the code interpreter to ensure accuracy.
+
+ {"name": "code_interpreter", "arguments": {"code": "total_pay_this_year = 200000\nbonus_percentage = 10 / 100\nbonus_this_year = total_pay_this_year * bonus_percentage\ntotal_income_this_year = total_pay_this_year + bonus_this_year\nprint(total_income_this_year)", "executes": "True"}}
+
+
+ [tool]
+
+ 220000.0
+
+ [assistant]
+
+
+ So the code shows that John's total income this year would be $220,000. This is consistent with the 10% bonus on last year's pay of $100,000, which was $10,000. Therefore, this year's bonus of $20,000 added to his total pay of $200,000 results in a total income of $220,000.
+
+ I need to verify that there are no other factors at play. The problem states it's a constant percentage bonus each year, so there's no mention of changing percentages or other variables. Therefore, applying a straightforward percentage of 10% on this year's partial salary amount would result in $20,000. Adding that to his total pay gives $220,000. The calculation is correct and aligns with the question's parameters.
+
+
+ #### 220000.0
+
+
+You can also use the `RolloutViewer` TUI tool to view the dumped rollout data:
+
+
+.. code-block:: bash
+
+ python scripts/rollout_viewer.py ${trainer.rollout_data_dir}
+
+
+.. image:: https://github.com/user-attachments/assets/e34e5157-2880-4a21-afb2-73885d0dfb11
+ :alt: RolloutViewer screenshot
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/docs/sglang_multiturn/search_tool_example.rst b/code/RL_model/verl/verl_train/docs/sglang_multiturn/search_tool_example.rst
new file mode 100644
index 0000000000000000000000000000000000000000..cbbdeb0d08e6102a00a85bd5544c345bb086969f
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/sglang_multiturn/search_tool_example.rst
@@ -0,0 +1,264 @@
+=======================
+Search Tool Integration
+=======================
+
+Last updated: 05/30/2025.
+
+Introduction
+------------
+- We have added a search tool-calling function to multi-turn RL, enabling the model to issue retrieval requests during Actor rollout and directly use the retrieval results for training. **We support using a local dense retriever as the retrieval tool, as well as integrating your own local retrieval engine.**
+
+
+
+Quick Reproduction
+------------------
+
+Create a New Docker Container
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: bash
+
+ docker run \
+ -it \
+ --shm-size 32g \
+ --gpus all \
+ -v {Huggingface-Cache-Path}:/root/.cache \
+ --ipc=host \
+ --network=host \
+ --privileged \
+ --name sglang_{your-name} \
+ lmsysorg/sglang:dev \
+ /bin/zsh
+
+If you need to restart after exiting the container:
+
+.. code:: bash
+
+ docker start -i sglang_{your-name}
+
+Update Python and Configure the Virtual Environment using uv
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: bash
+
+ apt update
+ apt install -y python3.10 python3.10-venv
+
+ # Create a virtual environment
+ python3 -m venv ~/.python/verl-multiturn-rollout
+
+ # Activate the virtual environment
+ source ~/.python/verl-multiturn-rollout/bin/activate
+
+ # Install uv
+ python3 -m pip install uv
+
+Install verl Upstream
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: bash
+
+ cd ~
+ git clone https://github.com/volcengine/verl.git
+ cd verl
+
+ # Install verl
+ python3 -m uv pip install .
+ python3 -m uv pip install -r ./requirements_sglang.txt
+
+ # Manually install flash-attn
+ python3 -m uv pip install wheel
+ python3 -m uv pip install packaging
+ python3 -m uv pip install flash-attn --no-build-isolation --no-deps
+
+Set Up a Local Retrieval Engine
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you are using your own local retrieval service, you can skip this
+step. We chose the local dense retriever provided in the search-R1
+example; detailed instructions are in the `searchR1
+docs `__.
+In brief:
+
+- The GPU version offers higher accuracy and speed; each GPU uses about
+ 5–7 GB of memory.
+- The CPU version can be used for simple testing but has lower
+ retrieval precision, which will degrade training performance. See the
+ `retriever
+ documentation `__
+ in search-R1 for details.
+- We recommend using Conda to install ``faiss-gpu=1.8.0``; installing it in a venv may cause errors.
+
+**Note**: To start both the training process and the local retrieval
+service, we launch two separate Python environments. The training uses
+uv in the verl-multiturn-rollout environment, while the retriever uses
+conda to install ``faiss-gpu``.
+
+.. code:: bash
+
+ # Download the Miniconda installer script
+ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
+
+ # Install to $HOME/miniconda3 in batch mode
+ bash ~/miniconda.sh -b -p $HOME/miniconda3
+
+ # Activate conda (only in the current shell)
+ eval "$($HOME/miniconda3/bin/conda shell.bash hook)"
+
+ # (Optional) Add conda to your default shell startup
+ conda init
+
+ # Reload shell config
+ source ~/.bashrc
+
+ # Create and activate the retriever environment with Python 3.10
+ conda create -n retriever python=3.10 -y
+ conda activate retriever
+
+ # Install PyTorch (with GPU support) and related libraries
+ conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 pytorch-cuda=12.1 -c pytorch -c nvidia -y
+
+ # Install other Python packages
+ pip install transformers datasets pyserini huggingface_hub
+
+ # Install the GPU version of faiss
+ conda install faiss-gpu=1.8.0 -c pytorch -c nvidia -y
+
+ # Install the API service framework
+ pip install uvicorn fastapi
+
+Download the Indexing and Corpus
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The local retrieval files are large, so prepare sufficient disk space.
+The download is about 60–70 GB, and it takes about 132 GB uncompressed:
+
+.. code:: bash
+
+ conda activate retriever
+
+ save_path=/the/path/to/save
+ python examples/sglang_multiturn/search_r1_like/local_dense_retriever/download.py --save_path $save_path
+ cat $save_path/part_* > $save_path/e5_Flat.index
+ gzip -d $save_path/wiki-18.jsonl.gz
+
+Start the Local flat e5 Retrieval Server
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+1. The first startup will download models and load the index.
+2. Apart from the download, startup takes about 1–2 minutes.
+3. After startup, each GPU uses about 5–7 GB of memory, leaving the rest
+ for multi-turn RL training.
+
+.. code:: bash
+
+ conda activate retriever
+
+ index_file=$save_path/e5_Flat.index
+ corpus_file=$save_path/wiki-18.jsonl
+ retriever_name=e5
+ retriever_path=intfloat/e5-base-v2
+
+ python examples/sglang_multiturn/search_r1_like/local_dense_retriever/retrieval_server.py \
+ --index_path $index_file \
+ --corpus_path $corpus_file \
+ --topk 3 \
+ --retriever_name $retriever_name \
+ --retriever_model $retriever_path \
+ --faiss_gpu
+
+Set Up WANDB_API_KEY
+~~~~~~~~~~~~~~~~~~~~
+
+.. code:: bash
+
+ export WANDB_API_KEY={YOUR_WANDB_API_KEY}
+
+ # Define a timestamp function
+ function now() {
+ date '+%Y-%m-%d-%H-%M'
+ }
+
+Preprocess the Dataset
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ **Note:** The following data processing and training commands must be
+ run in the verl-multiturn-rollout environment.
+
+.. code:: bash
+
+ python3 examples/data_preprocess/preprocess_search_r1_dataset.py
+
+Testing on 8 x H20
+~~~~~~~~~~~~~~~~~~
+
+.. code:: bash
+
+ # Ensure the now() function is defined
+ # Create a logs directory
+ mkdir -p logs
+
+ # Set GPUs and run with a suitable log path
+ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+ nohup bash examples/sglang_multiturn/search_r1_like/run_qwen2.5-3b_instruct_search_multiturn.sh \
+ trainer.experiment_name=qwen2.5-3b-it_rm-searchR1-like-sgl-multiturn-$(now) \
+ > logs/searchR1-like$(now).log 2>&1 &
+
+Custom Search Configuration
+---------------------------
+
+To enable multi-turn reasoning, set the following fields in your config:
+
+.. code:: yaml
+
+ actor_rollout_ref:
+ rollout:
+ name: "sglang"
+ multi_turn:
+ enable: True
+
+You must specify ``retrieval_service_url`` in ``examples/sglang_multiturn/config/tool_config/search_tool_config.yaml``, and properly configure concurrency. For more details on concurrency, refer to the Sandbox Fusion example:
+
+.. code:: yaml
+
+ tools:
+ - class_name: verl.tools.search_tool.SearchTool
+ config:
+ retrieval_service_url: http://127.0.0.1:8000/retrieve
+ num_workers: 120
+ rate_limit: 120
+ timeout: 30
+
+The retriever input/output formats are as follows. If your service
+parameters match, you only need to modify ``retrieval_service_url``. You
+can also customize it in ``search_r1_like_utils.py``.
+
+.. code:: python
+
+ Input format:
+ {
+ "queries": ["What is Python?", "Tell me about neural networks."],
+ "topk": 3,
+ "return_scores": true
+ }
+
+ Output format (when return_scores=True, similarity scores are returned):
+ {
+ "result": [
+ [ # Results for each query
+ {
+ "document": doc, "score": score
+ },
+ # ... more documents
+ ],
+ # ... results for other queries
+ ]
+ }
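+
+If you want to sanity-check a running retrieval service against this
+format, the following is a minimal sketch, assuming the server started in
+the steps above is listening on ``http://127.0.0.1:8000/retrieve``:
+
+.. code:: python
+
+    import requests
+
+    # Query the local dense retriever with the input format shown above.
+    payload = {
+        "queries": ["What is Python?"],
+        "topk": 3,
+        "return_scores": True,
+    }
+    resp = requests.post("http://127.0.0.1:8000/retrieve", json=payload, timeout=30)
+    resp.raise_for_status()
+
+    # Each query gets back a list of {"document": ..., "score": ...} entries.
+    for hits in resp.json()["result"]:
+        for hit in hits:
+            print(round(hit["score"], 3), str(hit["document"])[:80])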
+
+Notes
+-----
+
+1. The total training time is about 27 hours; meanwhile, the validation
+   dataset is very large (51k examples), and each validation takes about
+   6000 s. (This is why ``val_before_train=False`` by default.)
diff --git a/code/RL_model/verl/verl_train/docs/single_controller.rst b/code/RL_model/verl/verl_train/docs/single_controller.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d12177854e0ad2f2060a4255a4cde9cd93fe8263
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/single_controller.rst
@@ -0,0 +1,336 @@
+The Design of ``verl.single_controller``
+==============================================
+
+Last updated: 05/21/2025.
+
+**Author:**\ `Wang Zhang `__
+
+Preface
+-------
+
+We prepared this document for developers of ``verl``, particularly those
+interested in understanding or contributing to the
+``verl.single_controller`` module. It is not intended for end users, but
+for contributors seeking to understand the architectural rationale and
+internal mechanics.
+
+--------------
+
+Origin
+------
+
+The ``single_controller`` module originated from a request I received:
+adapt a toy single-process RLHF script into a distributed system with
+minimal changes, while maintaining ease of debugging.
+
+Common practice, such as using PyTorch's Distributed Data Parallel
+(DDP), typically involves wrapping ``nn.Module`` and launching multiple
+processes that execute the same function under different ranks. However,
+this approach presents two main limitations in the context of
+distributed RLHF:
+
+- difficulty representing multiple DAGs, as required by PPO;
+- difficulty inspecting intermediate tensors during training.
+
+To maintain debuggability, we opted for a different approach: breaking
+the training loop into well-defined stages like ``generate_sequences``,
+``compute_advantages``, and so on.
+
+We selected `Ray `__ as the initial backend for
+``verl`` due to its ability to expose Python class methods as RPC
+endpoints. However, Ray’s default model only supports **one method call,
+one RPC**, while training LLMs typically requires coordination across
+multiple processes.
+
+To hide from users the fact that a single method call fans out to
+multiple Ray actors, we introduced the following components:
+
+- ``WorkerGroup`` – manages a group of remote workers and provides
+ a unified interface for multi-process distributed computation;
+- ``ResourcePool`` – binds computational resources to worker
+ processes;
+- ``ClassWithArgs`` – enables delayed remote instantiation with
+ specified initialization arguments.
+
+--------------
+
+A Running Example: ``generate_sequences``
+-----------------------------------------
+
+To illustrate the design, we walk through how the ``generate_sequences``
+method in the ``ActorRolloutRefWorker`` class is registered and invoked
+across distributed workers.
+
+--------------
+
+Step 1: Register with a Decorator
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The first step is to define the ``generate_sequences`` method and decorate
+it with ``@register``, since it will be called from the driver script.
+
+**Source:**
+`fsdp_workers.py `__
+
+.. code:: python
+
+ class ActorRolloutRefWorker(Worker):
+ ...
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
+ def generate_sequences(self, prompts: DataProto):
+ prompts = prompts.to(torch.cuda.current_device())
+ ...
+
+The ``@register`` decorator adds metadata to the ``generate_sequences``
+method. Currently, it doesn’t alter functionality, but attaches
+attributes via a magic key (``MAGIC_ATTR``):
+
+**Source:**
+`decorator.py `__
+
+.. code:: python
+
+ def register(dispatch_mode=Dispatch.ALL_TO_ALL, execute_mode=Execute.ALL, blocking=True, materialize_futures=True):
+ ...
+ def decorator(func):
+ @wraps(func)
+ def inner(*args, **kwargs):
+ if materialize_futures:
+ args, kwargs = _materialize_futures(*args, **kwargs)
+ return func(*args, **kwargs)
+
+ attrs = {"dispatch_mode": dispatch_mode, "execute_mode": execute_mode, "blocking": blocking}
+ setattr(inner, MAGIC_ATTR, attrs)
+ return inner
+
+ return decorator
+
+As the code shows, the values of ``dispatch_mode``, ``execute_mode`` and
+``blocking`` are attached to the ``generate_sequences`` method.
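+
+To see the mechanism in isolation, here is a self-contained sketch (with a
+stand-in ``MAGIC_ATTR`` string rather than verl's actual constant) showing
+that the metadata simply rides along on the function object:
+
+.. code:: python
+
+    from functools import wraps
+
+    MAGIC_ATTR = "_verl_registered_attrs"  # stand-in for verl's magic key
+
+    def register(dispatch_mode="ALL_TO_ALL", execute_mode="ALL", blocking=True):
+        def decorator(func):
+            @wraps(func)
+            def inner(*args, **kwargs):
+                return func(*args, **kwargs)
+            # attach metadata without changing the method's behavior
+            setattr(inner, MAGIC_ATTR, {"dispatch_mode": dispatch_mode,
+                                        "execute_mode": execute_mode,
+                                        "blocking": blocking})
+            return inner
+        return decorator
+
+    @register(dispatch_mode="DP_COMPUTE_PROTO")
+    def generate_sequences(prompts):
+        return prompts
+
+    print(getattr(generate_sequences, MAGIC_ATTR))
+    # {'dispatch_mode': 'DP_COMPUTE_PROTO', 'execute_mode': 'ALL', 'blocking': True}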
+
+--------------
+
+Step 2: Binding During Initialization
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+These attached attributes are extracted and utilized when
+``ActorRolloutRefWorker``, wrapped in a ``RayClassWithArgs``, is passed
+into a ``RayWorkerGroup``.
+
+**Source:**
+`main_generation.py `__
+
+.. code:: python
+
+ ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(ActorRolloutRefWorker), config=config, role="rollout")
+ resource_pool = RayResourcePool(process_on_nodes=[config.trainer.n_gpus_per_node] * config.trainer.nnodes)
+ wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init)
+
+During the
+`initialization `__
+of ``RayWorkerGroup``, two key steps occur:
+
+1. Worker instances (Ray actors) are created:
+ `RayWorkerGroup._init_with_resource_pool `__
+2. Methods decorated with ``@register`` are bound to ``RayWorkerGroup``:
+ `RayWorkerGroup._bind_worker_method `__
+
+.. figure:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/worker_group_init.png?raw=true
+ :alt: initialization_and_binding_of_worker_group
+
+ initialization_and_binding_of_worker_group
+
+The binding procedure is the heart of ``verl.single_controller``.
+
+**Key function:**
+`WorkerGroup._bind_worker_method `__
+
+.. code:: python
+
+ def _bind_worker_method(self, user_defined_cls, func_generator):
+ ...
+ for method_name in dir(user_defined_cls):
+ try:
+ method = getattr(user_defined_cls, method_name)
+ assert callable(method)
+ except Exception:
+ continue # Skip properties
+            # ...
+
+When a method has the ``MAGIC_ATTR``, the attributes set by
+``@register`` are extracted:
+
+.. code:: python
+
+    # ...
+ if hasattr(method, MAGIC_ATTR):
+ attribute = getattr(method, MAGIC_ATTR)
+ dispatch_mode = attribute["dispatch_mode"]
+ execute_mode = attribute["execute_mode"]
+ blocking = attribute["blocking"]
+
+        # ...
+
+As shown in the flow chart above, these attributes are fed into
+``func_generator``. However, ``func_generator`` takes ``method_name``,
+``dispatch_fn``, ``collect_fn``, ``execute_fn``, ``blocking``. We need
+to find the corresponding ``dispatch_fn`` and ``collect_fn`` associated
+with the ``dispatch_mode`` (``DP_COMPUTE_PROTO``) from
+`DISPATCH_MODE_FN_REGISTRY `__:
+
+.. code:: python
+
+ DISPATCH_MODE_FN_REGISTRY = {
+ Dispatch.ONE_TO_ALL: {
+ "dispatch_fn": dispatch_one_to_all,
+ "collect_fn": collect_all_to_all,
+ },
+ ...
+ Dispatch.DP_COMPUTE_PROTO: {
+ "dispatch_fn": dispatch_dp_compute_data_proto,
+ "collect_fn": collect_dp_compute_data_proto,
+ },
+ ...
+ }
+
+Similarly, the ``execute_fn`` is selected by ``execute_mode`` and
+extracted by:
+
+.. code:: python
+
+    # ...
+ # get execute_fn_name
+ execute_mode = get_predefined_execute_fn(execute_mode=execute_mode)
+ wg_execute_fn_name = execute_mode["execute_fn_name"]
+
+ # get execute_fn from string
+ try:
+ execute_fn = getattr(self, wg_execute_fn_name)
+ assert callable(execute_fn), "execute_fn must be callable"
+ except Exception:
+ print(f"execute_fn {wg_execute_fn_name} is invalid")
+ raise
+    # ...
+
+In the ``generate_sequences`` case:
+
+- ``dispatch_mode = Dispatch.DP_COMPUTE_PROTO``
+- ``dispatch_fn = dispatch_dp_compute_data_proto``
+- ``collect_fn = collect_dp_compute_data_proto``
+- ``execute_fn = RayWorkerGroup.execute_all``
+
+ONE_TO_ALL vs. DP_COMPUTE_PROTO
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``dispatch_mode`` is associated with a ``dispatch_fn`` and a
+``collect_fn``. As the name implies, ``dispatch_fn`` processes the input
+arguments in the ``WorkerGroup`` and generates a batch (list) of input
+arguments, each of which is fed into a worker attached to the
+``WorkerGroup``.
+
+``dispatch_fn`` of ``ONE_TO_ALL`` is
+`dispatch_one_to_all `__,
+which just duplicates all the input arguments into N replicas, where N
+equals the number of Workers attached to the ``worker_group``:
+
+.. code:: python
+
+ def dispatch_one_to_all(worker_group, *args, **kwargs):
+ args = tuple([arg] * worker_group.world_size for arg in args)
+ kwargs = {k: [v] * worker_group.world_size for k, v in kwargs.items()}
+ return args, kwargs
+
+``dispatch_fn`` of ``DP_COMPUTE_PROTO`` is
+`dispatch_dp_compute_data_proto `__,
+which uses ``DataProto.chunk`` to split a large ``DataProto`` into N
+smaller ``DataProto`` objects, where N equals the world size (number of
+workers) of the ``worker_group``:
+
+.. code:: python
+
+ def dispatch_dp_compute_data_proto(worker_group, *args, **kwargs):
+ from verl.single_controller.base.worker_group import WorkerGroup
+
+ assert isinstance(worker_group, WorkerGroup)
+        # Note: enable auto padding for dp compute DataProto
+ splitted_args, splitted_kwargs = _split_args_kwargs_data_proto_with_auto_padding(
+ worker_group.world_size,
+ *args,
+ **kwargs,
+ )
+ return splitted_args, splitted_kwargs
+
+The ``collect_fn`` follows the same pattern: it processes a batch (list)
+of values returned from all workers of a ``WorkerGroup`` and merges it
+into a list (as ``collect_all_to_all`` does) or into a large ``DataProto``
+(as ``collect_dp_compute_data_proto`` does).
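+
+To make the two modes concrete, here is a self-contained toy sketch (plain
+lists instead of ``DataProto``, and a fake two-worker group) illustrating
+how dispatch and collect mirror each other:
+
+.. code:: python
+
+    class ToyWorkerGroup:
+        world_size = 2
+
+    def dispatch_one_to_all(wg, batch):
+        # ONE_TO_ALL: every worker receives the same full input
+        return [batch] * wg.world_size
+
+    def dispatch_dp_compute(wg, batch):
+        # DP_COMPUTE_PROTO (toy version): split the batch into world_size chunks
+        k = len(batch) // wg.world_size
+        return [batch[i * k:(i + 1) * k] for i in range(wg.world_size)]
+
+    def collect_dp_compute(outputs):
+        # merge per-worker outputs back into one large batch
+        return [x for out in outputs for x in out]
+
+    wg = ToyWorkerGroup()
+    batch = [1, 2, 3, 4]
+    print(dispatch_one_to_all(wg, batch))            # [[1, 2, 3, 4], [1, 2, 3, 4]]
+    shards = dispatch_dp_compute(wg, batch)          # [[1, 2], [3, 4]]
+    outputs = [[x * 10 for x in s] for s in shards]  # pretend remote compute
+    print(collect_dp_compute(outputs))               # [10, 20, 30, 40]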
+
+Finally, a new method is dynamically generated using ``func_generator``
+and added to the ``WorkerGroup`` instance:
+
+.. code:: python
+
+    # ...
+ # bind a new method to the RayWorkerGroup
+ func = func_generator(
+ self,
+ method_name,
+ dispatch_fn=dispatch_fn,
+ collect_fn=collect_fn,
+ execute_fn=execute_fn,
+ blocking=blocking,
+ )
+
+ try:
+ setattr(self, method_name, func)
+ method_names.append(method_name)
+ except Exception as e:
+ raise ValueError(f"Fail to set method_name {method_name}") from e
+
+This makes the method invocable via the ``WorkerGroup`` interface.
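+
+A rough, self-contained sketch of what the generated method does (this is
+illustrative only, not verl's actual ``func_generator``; remote execution
+is faked with a local function):
+
+.. code:: python
+
+    def func_generator(wg, method_name, dispatch_fn, collect_fn, execute_fn, blocking):
+        # "blocking" is handled via ray.get on futures in the real implementation
+        def method(*args, **kwargs):
+            shards = dispatch_fn(wg, *args, **kwargs)  # 1. split/duplicate inputs
+            outputs = execute_fn(method_name, shards)  # 2. run on every worker
+            return collect_fn(wg, outputs)             # 3. merge per-worker results
+        return method
+
+    class FakeWorkerGroup:
+        world_size = 2
+
+    wg = FakeWorkerGroup()
+    generate_sequences = func_generator(
+        wg,
+        "generate_sequences",
+        dispatch_fn=lambda wg, b: [b[:2], b[2:]],
+        collect_fn=lambda wg, outs: [x for o in outs for x in o],
+        execute_fn=lambda name, shards: [[x * 2 for x in s] for s in shards],
+        blocking=True,
+    )
+    print(generate_sequences([1, 2, 3, 4]))  # [2, 4, 6, 8]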
+
+--------------
+
+Step 3: Call Chain
+~~~~~~~~~~~~~~~~~~
+
+All the machinery above ensures that distributed calls feel identical to
+single-process ones. In the original single-process script, the code
+looks like:
+
+.. code:: python
+
+ rollout = Rollout()
+ rollout.generate_sequences(batch)
+
+With ``verl``, the multiprocess program becomes:
+
+.. code:: python
+
+    rollout = RayWorkerGroup(resource_pool=RayResourcePool([4]), ray_cls_with_init=RayClassWithArgs(Rollout))
+ rollout.generate_sequences(batch)
+
+.. figure:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/call_generate_sequences.png?raw=true
+ :alt: call_chain_of_generate_sequences
+
+ call_chain_of_generate_sequences
+
+Behind this simple call:
+
+- ``dispatch_fn`` splits the input across workers
+- ``execute_fn`` performs the actual remote invocation
+- ``collect_fn`` gathers the results
+
+All of this is abstracted away, enabling developers to write distributed
+code with minimal changes to their existing logic.
+
+--------------
+
+Beyond RL Post-Training: Generalizing ``verl.single_controller``
+----------------------------------------------------------------
+
+The ``verl.single_controller`` module generalizes well beyond
+reinforcement learning. It provides a clean abstraction to batch-process
+remote method calls, with automatic input/output handling.
+
+By minimizing the gap between single-process and multi-process scripts,
+``verl.single_controller`` opens the door to distributed computing in
+broader domains — not limited to RL post-training.
+
+We hope this design inspires more examples and extensions from the
+community.
diff --git a/code/RL_model/verl/verl_train/docs/start/agentic_rl.rst b/code/RL_model/verl/verl_train/docs/start/agentic_rl.rst
new file mode 100644
index 0000000000000000000000000000000000000000..73c0a7ce1e1d8a43f9811b571b634fa94f162a10
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/start/agentic_rl.rst
@@ -0,0 +1,133 @@
+Agentic RL Training
+===================
+
+Last updated: 07/15/2025.
+
+Overview
+----------
+The goal of Agentic RL is to improve the performance of the Agent's backend model through reinforcement learning. To support the training process, a series of features have been developed:
+
+1. Server-based asynchronous rollout
+2. Multi-turn conversations and tool calls
+3. LangGraph-based Agent
+
+
+This document explains the system principles and usage involved to help users implement Agentic RL.
+
+
+Server-based Asynchronous Rollout
+---------------------------------
+
+Since Agents need to interact with the environment through various tool calls, an asyncio-based coroutine mechanism is used to execute each rollout request asynchronously, which avoids GPU idling while waiting for tool-call results and thereby improves training performance (see the sketch below). To support asynchronous rollout, the inference engine (server) and the agent (client) are architecturally separated, implementing a server-based system with the following objectives:
+
+1. Enabling load-balancing mechanisms to balance loads across multiple GPUs and reduce the impact of long-tail requests on performance. For this purpose, scheduling capabilities in stream mode (``recipe/stream_mode``) are implemented as a recipe.
+2. Preventing agent-specific features such as tracing from affecting the inference engine.
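+
+The effect of the coroutine-based scheduling can be seen in a small
+self-contained asyncio sketch (illustrative names only, not verl code):
+many rollout requests overlap their waiting time instead of running one
+after another.
+
+.. code-block:: python
+
+    import asyncio
+    import time
+
+    async def rollout_request(i: int) -> str:
+        # stand-in for LLM generation plus a slow tool call
+        await asyncio.sleep(1.0)
+        return f"rollout {i} finished"
+
+    async def main():
+        start = time.perf_counter()
+        # 8 requests run concurrently, so wall time is ~1 s instead of ~8 s
+        results = await asyncio.gather(*(rollout_request(i) for i in range(8)))
+        print(len(results), f"elapsed: {time.perf_counter() - start:.1f}s")
+
+    asyncio.run(main())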
+
+System Architecture
+~~~~~~~~~~~~~~~~~~~
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop.png?raw=true
+
+For more detail on internal design, please refer to :doc:`Agent Loop<../advance/agent_loop>`.
+
+System Components
+~~~~~~~~~~~~~~~~~
+
++--------------------------+----------------------------------------------------------------------------+
+| Component | Role |
++==========================+============================================================================+
+| AgentLoop | Client, implements Agent functions |
++--------------------------+----------------------------------------------------------------------------+
+| AsyncLLMServerManager | Inference gateway, provides generate interface for AgentLoop |
++--------------------------+----------------------------------------------------------------------------+
+| AsyncServer | Server, each instance is connected to one DP group of the inference engine |
++--------------------------+----------------------------------------------------------------------------+
+
+**"generate" Interface**
+
+The "generate" function based on ray actor is used between the Client and Server instead of the standard chat completion API. This is because the conversion between tokens and text can be irreversible. For example, the token converted from "" will be different from that generated by the LLM. During the training phase, it is necessary to strictly use the tokens generated by LLM inference to avoid inaccurate in computing advantage, which may affect model performance. Having the Server provide a token-based API helps the Client maintain the relationship between the text generated by tool calls and the tokens returned by the LLM, so as to output correct tokens for training.
+
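+A toy illustration of the irreversibility (not verl code; a three-token
+"tokenizer" where two different token sequences decode to the same text):
+
+.. code-block:: python
+
+    vocab = {1: "ab", 2: "a", 3: "b"}
+
+    def decode(ids):
+        return "".join(vocab[i] for i in ids)
+
+    generated = [2, 3]           # what the LLM actually sampled: "a" + "b"
+    text = decode(generated)     # "ab"
+    retokenized = [1]            # a greedy tokenizer prefers the single token "ab"
+    assert decode(retokenized) == text and retokenized != generated
+    # Training must therefore keep the original ids [2, 3], not re-tokenize the text.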
+
+**Inference Engine Adaptation**
+
+AsyncServer uniformly provides a generate function to the upper layer, with separate implementations for SGLang and vLLM to hide underlying differences:
+
+1. The SGLang AsyncServer uses the async_generate interface of the SGLang engine, which is located on the first GPU of each TP group. Therefore, AsyncServer needs to remotely call async_generate through ray actor.
+2. The vLLM AsyncServer uses the generate interface of the vLLM engine, which can communicate with the GPUs in the TP group through ZMQ and can be directly called in AsyncServer.
+
+
+Usage Example
+~~~~~~~~~~~~~
+
+Follow :doc:`GSM8K example<../examples/gsm8k_example>` to prepare the dataset and model checkpoints.
+
+There are two options required to use agent loop:
+
+- `data.return_raw_chat=True`
+- `actor_rollout_ref.rollout.mode=async`
+
+This example uses the SGLang inference engine by default; you can also modify the rollout name (``actor_rollout_ref.rollout.name``) to use vLLM.
+
+.. code-block:: bash
+
+ bash examples/grpo_trainer/run_qwen2-7b_seq_balance.sh
+
+
+Multi-turn Conversations and Tool Calls
+---------------------------------------
+
+Follow :doc:`Multi-turn Rollout Support<../sglang_multiturn/multiturn>` to prepare tool and configuration files.
+
+The Tool Agent Loop has an additional requirement: adding an "agent_name" field to the dataset. During rollout, it will choose to use tool_agent_loop or single_turn_agent (default) based on this field.
+
+Usage Example
+~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ # install mlflow to view toolcall and llm trace
+ pip install mlflow
+
+ # This will download and preprocess the GSM8K dataset into ~/data/gsm8k/ and add the "agent_name" field.
+ python examples/data_preprocess/gsm8k_tool_agent_loop.py
+
+    # Start training with tool calls and mlflow-based tracing enabled to help debug rollout details
+ bash examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_tool_agent_mlflow.sh
+
+ # When training is done, start a mlflow server to view trace
+ mlflow ui -h 0.0.0.0 -p 5000 --backend-store-uri sqlite:////tmp/mlruns.db
+
+    # then you can open http://<host>:5000 in a browser to view traces
+
+
+Note: During training, the model may sometimes fail to generate correct tool-call tags, in which case the error message "Failed to decode tool call" is printed to the console; this does not indicate an abnormality in training.
+
+
+Follow :doc:`Rollout trace<../advance/rollout_trace>` to learn more about the trace feature.
+
+
+
+Agent Framework
+---------------
+
+System Architecture
+~~~~~~~~~~~~~~~~~~~
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/langgraph_agent.png?raw=true
+
+System Components
+~~~~~~~~~~~~~~~~~
+
++--------------------------+-----------------------------------------------------------------------------------------------+
+| Component | Role |
++==========================+===============================================================================================+
+| ChatModel | LLM object of LangChain, used to adapt to the “generate” api provided by AsyncLLMServerManager|
++--------------------------+-----------------------------------------------------------------------------------------------+
+| RectAgentLoop            | Agent adaptation layer, which by default supports a naive LangGraph Agent.   |
+| | New classes can be derived to support user-defined Agents, and the run function needs to be |
+| | implemented to complete Agent calls. |
++--------------------------+-----------------------------------------------------------------------------------------------+
+| AsyncServer | Server, each instance is connected to one DP group of the inference engine. |
++--------------------------+-----------------------------------------------------------------------------------------------+
+
+
+Follow ``recipe/langgraph_agent/example/README.md`` for more details.
\ No newline at end of file
diff --git a/code/RL_model/verl/verl_train/docs/start/install.rst b/code/RL_model/verl/verl_train/docs/start/install.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2686713fbbef85c58da547fca27c42550748a684
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/start/install.rst
@@ -0,0 +1,319 @@
+Installation
+============
+
+Requirements
+------------
+
+- **Python**: Version >= 3.10
+- **CUDA**: Version >= 12.8
+
+verl supports various backends. Currently, the following configurations are available:
+
+- **FSDP** and **Megatron-LM** (optional) for training.
+- **SGLang**, **vLLM** and **TGI** for rollout generation.
+
+Choices of Backend Engines
+----------------------------
+
+1. Training:
+
+We recommend using the **FSDP** backend to investigate, research, and prototype different models, datasets, and RL algorithms. The guide for using the FSDP backend can be found in :doc:`FSDP Workers<../workers/fsdp_workers>`.
+
+For users who pursue better scalability, we recommend using the **Megatron-LM** backend. Currently, we support `Megatron-LM v0.13.1 `_. The guide for using the Megatron-LM backend can be found in :doc:`Megatron-LM Workers<../workers/megatron_workers>`.
+
+
+2. Inference:
+
+For inference, vLLM 0.8.3 and later versions have been tested for stability. We recommend turning on the environment variable ``VLLM_USE_V1=1`` for optimal performance.
+
+For SGLang, refer to the :doc:`SGLang Backend<../workers/sglang_worker>` for detailed installation and usage instructions. SGLang rollout is under extensive development and offers many advanced features and optimizations. We encourage users to report any issues or provide feedback via the `SGLang Issue Tracker `_.
+
+The Hugging Face TGI integration is usually used for debugging and single-GPU exploration.
+
+Install from docker image
+-------------------------
+
+Starting from v0.6.0, we use the vLLM and SGLang release images as our base images.
+
+Base Image
+::::::::::
+
+- vLLM: https://hub.docker.com/r/vllm/vllm-openai
+- SGLang: https://hub.docker.com/r/lmsysorg/sglang
+
+Application Image
+:::::::::::::::::
+
+On top of the base image, the following packages are added:
+
+- flash_attn
+- Megatron-LM
+- Apex
+- TransformerEngine
+- DeepEP
+
+Latest Dockerfiles:
+
+- `Dockerfile.stable.vllm `_
+- `Dockerfile.stable.sglang `_
+
+All pre-built images are available on Docker Hub: `verlai/verl `_. For example, ``verlai/verl:sgl055.latest``, ``verlai/verl:vllm011.latest``.
+
+You can find the latest images used for development and CI in our GitHub workflows:
+
+- `.github/workflows/vllm.yml `_
+- `.github/workflows/sgl.yml `_
+
+
+Installation from Docker
+::::::::::::::::::::::::
+
+After pulling the desired Docker image and installing the desired inference and training frameworks, you can run it with the following steps:
+
+1. Launch the desired Docker image and attach to it:
+
+.. code:: bash
+
+    docker create --runtime=nvidia --gpus all --net=host --shm-size="10g" --cap-add=SYS_ADMIN -v .:/workspace/verl --name verl <image:tag> sleep infinity
+ docker start verl
+ docker exec -it verl bash
+
+
+2. If you use the images provided, you only need to install verl itself without dependencies:
+
+.. code:: bash
+
+ # install the nightly version (recommended)
+ git clone https://github.com/volcengine/verl && cd verl
+ pip3 install --no-deps -e .
+
+[Optional] If you hope to switch between different frameworks, you can install verl with the following command:
+
+.. code:: bash
+
+ # install the nightly version (recommended)
+ git clone https://github.com/volcengine/verl && cd verl
+ pip3 install -e .[vllm]
+ pip3 install -e .[sglang]
+
+
+Install from custom environment
+---------------------------------------------
+
+We recommend using docker images for convenience. However, if your environment is not compatible with the docker image, you can also install verl in a python environment.
+
+.. note::
+
+   - The Dockerfiles provide more details than these installation instructions. You can find examples in each Dockerfile, for example `verl0.6-cu128-torch2.8.0-fa2.7.4 Dockerfile.base `_ .
+
+
+Pre-requisites
+::::::::::::::
+
+For training and inference engines to make better and faster use of the hardware, CUDA/cuDNN and other dependencies are required,
+and some of these dependencies are easily overridden when installing other packages,
+so we put them in the :ref:`Post-installation` step.
+
+.. note::
+
+ - The installation steps below are recommended configurations for the latest version of verl.
+
+ If you are trying to customize your own environment, please ignore the strict constraints.
+
+We need to install the following pre-requisites:
+
+- **CUDA**: Version >= 12.8
+- **cuDNN**: Version >= 9.10.0
+- **Apex**
+
+CUDA 12.8 or above is recommended, matching the docker image;
+please refer to `NVIDIA's official website `_ for other versions of CUDA.
+
+.. code:: bash
+
+    # change directory to anywhere you like; installing in the verl source directory is not recommended
+ wget https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda-repo-ubuntu2204-12-8-local_12.8.1-570.124.06-1_amd64.deb
+ dpkg -i cuda-repo-ubuntu2204-12-8-local_12.8.1-570.124.06-1_amd64.deb
+ cp /var/cuda-repo-ubuntu2204-12-8-local/cuda-*-keyring.gpg /usr/share/keyrings/
+ apt-get update
+ apt-get -y install cuda-toolkit-12-8
+ update-alternatives --set cuda /usr/local/cuda-12-8
+
+
+cuDNN can be installed via the following command;
+please refer to `NVIDIA's official website `_ for other versions of cuDNN.
+
+.. code:: bash
+
+    # change directory to anywhere you like; installing in the verl source directory is not recommended
+ wget https://developer.download.nvidia.com/compute/cudnn/9.10.2/local_installers/cudnn-local-repo-ubuntu2204-9.10.2_1.0-1_amd64.deb
+ dpkg -i cudnn-local-repo-ubuntu2204-9.10.2_1.0-1_amd64.deb
+ cp /var/cudnn-local-repo-ubuntu2204-9.10.2/cudnn-*-keyring.gpg /usr/share/keyrings/
+ apt-get update
+ apt-get -y install cudnn-cuda-12
+
+Install dependencies
+::::::::::::::::::::
+
+.. note::
+
+   We recommend using a fresh conda environment to install verl and its dependencies.
+
+   **Notice that the inference frameworks often strictly constrain your pytorch version and will directly override your installed pytorch if you are not careful.**
+
+   As a countermeasure, it is recommended to install the inference frameworks first, together with the pytorch they need. For vLLM, if you hope to use your existing pytorch,
+   please follow their official instructions
+   `Use an existing PyTorch installation `_ .
+
+
+1. First of all, to manage the environment, we recommend using conda:
+
+.. code:: bash
+
+ conda create -n verl python==3.12
+ conda activate verl
+
+
+2. Then, execute the installation script that we provide in verl:
+
+.. code:: bash
+
+ # Make sure you have activated verl conda env
+ # If you need to run with megatron
+ bash scripts/install_vllm_sglang_mcore.sh
+ # Or if you simply need to run with FSDP
+ USE_MEGATRON=0 bash scripts/install_vllm_sglang_mcore.sh
+
+
+If you encounter errors in this step, please check the script and manually follow the steps in the script.
+
+[Optional] NVIDIA Apex is recommended for Megatron-LM training, but it's not needed if you only use the FSDP backend.
+You can install it via the following command, but note that this step can take a very long time.
+It is recommended to set the ``MAX_JOBS`` environment variable to accelerate the installation,
+but do not set it too large, otherwise the memory will be overloaded and your machine may hang.
+
+.. code:: bash
+
+    # change directory to anywhere you like; installing in the verl source directory is not recommended
+ git clone https://github.com/NVIDIA/apex.git && \
+ cd apex && \
+    MAX_JOBS=32 pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
+
+Install verl
+::::::::::::
+
+For installing the latest version of verl, the best way is to clone and
+install it from source. Then you can modify our code to customize your
+own post-training jobs.
+
+.. code:: bash
+
+ git clone https://github.com/volcengine/verl.git
+ cd verl
+ pip install --no-deps -e .
+
+
+Post-installation
+:::::::::::::::::
+
+Please make sure that the installed packages are not overridden during the installation of other packages.
+
+The packages worth checking are:
+
+- **torch** and torch series
+- **vLLM**
+- **SGLang**
+- **pyarrow**
+- **tensordict**
+- **nvidia-cudnn-cu12**: for the Megatron backend
+
+If you encounter package version issues while running verl, please update the outdated ones; the snippet below can help you check what is installed.
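+
+A quick way to verify that nothing was silently overridden is to print the
+installed versions (the package names below are the ones listed above):
+
+.. code:: python
+
+    from importlib.metadata import PackageNotFoundError, version
+
+    for pkg in ["torch", "vllm", "sglang", "pyarrow", "tensordict", "nvidia-cudnn-cu12"]:
+        try:
+            print(f"{pkg}: {version(pkg)}")
+        except PackageNotFoundError:
+            print(f"{pkg}: not installed")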
+
+
+Install with AMD GPUs - ROCM kernel support
+------------------------------------------------------------------
+
+When you run verl on AMD GPUs (MI300) with the ROCm platform, you cannot use the previous quickstart. You should follow the steps below to build a docker image and run it.
+If you encounter any issues using AMD GPUs to run verl, feel free to contact `Yusheng Su `_.
+
+Find the docker for AMD ROCm: `docker/Dockerfile.rocm `_
+::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+
+.. code-block:: bash
+
+ # Build the docker in the repo dir:
+ # docker build -f docker/Dockerfile.rocm -t verl-rocm:03.04.2015 .
+ # docker images # you can find your built docker
+ FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
+
+ # Set working directory
+ # WORKDIR $PWD/app
+
+ # Set environment variables
+ ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+
+ # Install vllm
+ RUN pip uninstall -y vllm && \
+ rm -rf vllm && \
+ git clone -b v0.6.3 https://github.com/vllm-project/vllm.git && \
+ cd vllm && \
+ MAX_JOBS=$(nproc) python3 setup.py install && \
+ cd .. && \
+ rm -rf vllm
+
+ # Copy the entire project directory
+ COPY . .
+
+ # Install dependencies
+ RUN pip install "tensordict<0.6" --no-deps && \
+ pip install accelerate \
+ codetiming \
+ datasets \
+ dill \
+ hydra-core \
+ liger-kernel \
+ numpy \
+ pandas \
+ datasets \
+ peft \
+ "pyarrow>=15.0.0" \
+ pylatexenc \
+ "ray[data,train,tune,serve]" \
+ torchdata \
+ transformers \
+ wandb \
+ orjson \
+ pybind11 && \
+ pip install -e . --no-deps
+
+Build the image
+::::::::::::::::::::::::
+
+.. code-block:: bash
+
+ docker build -t verl-rocm .
+
+Launch the container
+::::::::::::::::::::::::::::
+
+.. code-block:: bash
+
+ docker run --rm -it \
+ --device /dev/dri \
+ --device /dev/kfd \
+ -p 8265:8265 \
+ --group-add video \
+ --cap-add SYS_PTRACE \
+ --security-opt seccomp=unconfined \
+ --privileged \
+ -v $HOME/.ssh:/root/.ssh \
+ -v $HOME:$HOME \
+ --shm-size 128G \
+ -w $PWD \
+ verl-rocm \
+ /bin/bash
+
+If you do not want to run in root mode and want to assign yourself as the user,
+please add ``-e HOST_UID=$(id -u)`` and ``-e HOST_GID=$(id -g)`` to the above docker launch script.
+
+verl with AMD GPUs currently supports FSDP as the training engine, and vLLM and SGLang as inference engines. We will support Megatron in the future.
diff --git a/code/RL_model/verl/verl_train/docs/start/more_resources.rst b/code/RL_model/verl/verl_train/docs/start/more_resources.rst
new file mode 100644
index 0000000000000000000000000000000000000000..aa8cb2a62b46579ee4bef2880d7f62485175495e
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/start/more_resources.rst
@@ -0,0 +1,7 @@
+More Resources
+==============
+
+Last updated: 06/30/2025.
+
+- Introduction to verl (`Slides `_)
+- verl Code Walkthrough (`Slides `_, `Talk in Chinese `_)
diff --git a/code/RL_model/verl/verl_train/docs/start/multinode.rst b/code/RL_model/verl/verl_train/docs/start/multinode.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4dd7d174aa465b966dfa41fff9c5d1fc1de0edff
--- /dev/null
+++ b/code/RL_model/verl/verl_train/docs/start/multinode.rst
@@ -0,0 +1,821 @@
+Multinode Training
+==================
+
+Last updated: 06/10/2025.
+
+.. _wuxibin89: https://github.com/wuxibin89
+
+Author: `Xibin Wu `_, `Yusheng Su `_.
+
+Option 1: Launch Manually
+------------------------------
+
+Set up multinode ray cluster
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+1. Start the head node with ``ray start --head --dashboard-host=0.0.0.0``. There are two addresses you should care about:
+
+- GCS address: ``ray start --address=<GCS address>``, which worker nodes should connect to.
+- Dashboard address: ``<head node IP>:8265``, where you should submit jobs to the cluster.
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/head.png?raw=true
+
+2. Start each worker node with the ``ray start --address=<GCS address>`` command you got above.
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/worker.png?raw=true
+
+3. Now you should see that the cluster has 2 nodes with ``ray status``.
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/status.png?raw=true
+
+4. Additionally, you can access the dashboard in the browser with the address you got above.
+
+*Firewall rules may need to be configured to access the dashboard; if there is any trouble, please contact your network administrator.*
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/overview.png?raw=true
+
+Submit job to ray cluster
+~~~~~~~~~~~~~~~~~~~~~~~~~
+1. Submit a ray job to the cluster with the dashboard address you got above.
+
+.. code-block:: bash
+
+ ray job submit --address="http://127.0.0.1:8265" \
+ --runtime-env=verl/trainer/runtime_env.yaml \
+ --no-wait \
+ -- \
+ python3 -m verl.trainer.main_ppo \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=2 \
+ ...
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/submit.png?raw=true
+
+2. Then you can check the job status with the following commands:
+
+- ``ray job list``: list all jobs submitted to the cluster.
+- ``ray job logs <Submission ID>``: query the logs of a job.
+- ``ray job status <Submission ID>``: query the status of a job.
+- ``ray job stop <Submission ID>``: request a job to be stopped.
+- ``ray job list | grep submission_id | grep JobStatus | grep RUNNING | grep -oP 'raysubmit_[^'\''"]+' | head -n 1``: get the submission ID of the latest running job.
+- ``ray job logs <Submission ID> --follow``: the ``--follow`` flag enables continuous log streaming.
+
+3. You can also access driver/task/actor logs in ``/tmp/ray/session_latest/logs/``; the driver log is ``job-driver-raysubmit_<Submission ID>.log``.
+
+4. We strongly recommend viewing job details from the dashboard in multinode training, because it provides a more structured way to view job information.
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/job.png?raw=true
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/job_detail.png?raw=true
+
+Option 2: Launch via SkyPilot on Kubernetes or clouds
+------------------------------------------------------
+
+.. note::
+ Ready-to-use SkyPilot example configurations are available in the `examples/skypilot/ `_ directory:
+
+ - ``verl-ppo.yaml`` - PPO training with GSM8K dataset
+ - ``verl-grpo.yaml`` - GRPO training with MATH dataset
+ - ``verl-multiturn-tools.yaml`` - Multi-turn tool usage training
+
+ See the `SkyPilot examples README `_ for detailed usage instructions.
+
+Step 1: Setup SkyPilot
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+SkyPilot supports different clouds; here we use GCP as an example. See `install skypilot `_.
+
+.. code-block:: bash
+
+ conda create -y -n sky python=3.10
+ conda activate sky
+ pip install "skypilot[gcp]"
+
+ conda install -c conda-forge google-cloud-sdk
+ gcloud init
+
+ # Run this if you don't have a credential file.
+ # This will generate ~/.config/gcloud/application_default_credentials.json.
+ gcloud auth application-default login
+
+ # Check if the GCP credential is correctly setup.
+ sky check gcp
+
+.. image:: https://github.com/yottalabsai/open-source/blob/main/static/verl/setup_skypilot.png?raw=true
+
+Step 2: Prepare dataset
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ git clone https://github.com/volcengine/verl.git
+    cd verl/examples/data_preprocess
+ python3 gsm8k.py --local_save_dir ~/data/gsm8k
+
+
+Step 3: Submit a job with SkyPilot
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+1. Create a SkyPilot YAML ``verl-cluster.yml`` with the following content:
+
+.. note:: ``workdir: .`` will sync all the data in the current directory to the remote cluster.
+
+.. code-block:: yaml
+
+ resources:
+ accelerators: L4:1 # every node has 1 L4 GPU
+ image_id: docker:verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4
+ memory: 64+ # every node has 64 GB memory
+ ports: 8265 # expose port for ray dashboard
+
+ num_nodes: 2 # cluster size
+
+ # --------------- Work Directory Synchronization (workdir) ---------------
+ # Defines the local working directory to be synchronized to the remote cluster.
+ # Here, '.' means synchronizing the directory where the sky submit command is currently run.
+ workdir: .
+
+ # --------------- (secrets) ---------------
+ secrets:
+ ## your wandb api key ##
+ WANDB_API_KEY: null
+
+ # --------------- File Mounts/Data Upload (file_mounts) ---------------
+ # If your dataset (gsm8k folder) is local, it needs to be uploaded to the remote cluster.
+ file_mounts:
+ # Remote path (relative to remote user's home directory): Local path
+ # /remote/dir1/file: /local/dir1/file
+ data/gsm8k: ~/data/gsm8k
+
+ # --------------- Environment Setup (setup) ---------------
+ # Commands run on each node of the remote cluster to set up the environment (e.g., install dependencies). These are run directly inside Docker.
+ setup: |
+ rm -rf verl
+ git clone https://github.com/volcengine/verl.git
+ cd verl
+ pip3 install -v -e .[vllm]
+
+ # --------------- Run Command (run) ---------------
+ # The actual task commands to be executed on the remote cluster.
+ # This script will first start the Ray cluster (different ray start commands are executed on Head and Worker nodes).
+ # Then, your training script will only be run on the Head node (SKYPILOT_NODE_RANK == 0).
+ run: |
+ # Get the Head node's IP and total number of nodes (environment variables injected by SkyPilot).
+ head_ip=`echo "$SKYPILOT_NODE_IPS" | head -n1`
+ num_nodes=`echo "$SKYPILOT_NODE_IPS" | wc -l` # Here num_nodes should be equal to 2.
+
+ # login wandb
+ python3 -c "import wandb; wandb.login(relogin=True, key='$WANDB_API_KEY')"
+
+ # Start Ray based on node role (Head=0, Worker>0).
+ # This logic is a standard Ray cluster startup script.
+ if [ "$SKYPILOT_NODE_RANK" == "0" ]; then
+ # Head node starts Ray Head.
+ echo "Starting Ray head node..."
+ # Check if a Ray Head is already running to avoid duplicate starts.
+ ps aux | grep ray | grep 6379 &> /dev/null || ray start --head --disable-usage-stats \
+ --port=6379 \
+ --dashboard-host=0.0.0.0 \
+ --dashboard-port=8265
+
+ # Wait for all worker nodes to join the cluster.
+ while [ $(ray nodes | grep NODE_ID | wc -l) -lt $num_nodes ]; do
+ echo "Waiting for all nodes to join... ($(ray nodes | grep NODE_ID | wc -l)/$num_nodes)"
+ sleep 5
+ done
+
+ # Head node executes the training script.
+ echo "Executing training script on head node..."
+
+ python3 -m verl.trainer.main_ppo \
+ data.train_files=data/gsm8k/train.parquet \
+ data.val_files=data/gsm8k/test.parquet \
+ data.train_batch_size=256 \
+ data.max_prompt_length=512 \
+ data.max_response_length=256 \
+ actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=64 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+ critic.optim.lr=1e-5 \
+ critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \
+ critic.ppo_micro_batch_size_per_gpu=4 \
+ algorithm.kl_ctrl.kl_coef=0.001 \
+ trainer.logger=['console','wandb'] \
+ trainer.val_before_train=False \
+ trainer.default_hdfs_dir=null \
+ trainer.n_gpus_per_node=1 \
+ trainer.nnodes=2 \
+ trainer.save_freq=20 \
+ trainer.test_freq=20 \
+ trainer.total_epochs=2 \
+ trainer.project_name=verl_examples \
+ trainer.experiment_name=experiment_name_gsm8k
+
+ else
+ # Wait for Ray Head to start.
+ sleep 10 # Increase waiting time to ensure Head finishes starting.
+ # Worker node starts Ray Worker.
+ echo "Starting Ray worker node..."
+
+ # Check if a Ray Worker is already running to avoid duplicate starts.
+ ps aux | grep ray | grep $head_ip:6379 &> /dev/null || ray start --address $head_ip:6379 --disable-usage-stats
+
+ # Add sleep to after `ray start` to give ray enough time to daemonize
+ sleep 5 # Ensure Worker successfully connects to Head.
+ fi
+
+ # No commands are added to the Worker node here; the Worker's main task is to start Ray and wait for the Head node to assign tasks.
+ echo "Node setup and Ray start script finished for rank $SKYPILOT_NODE_RANK."
+
+
+.. code-block:: bash
+
+ export WANDB_API_KEY=
+ sky launch -c verl --secret WANDB_API_KEY verl-cluster.yml
+
+.. image:: https://github.com/yottalabsai/open-source/blob/main/static/verl/running_job.png?raw=true
+.. image:: https://github.com/yottalabsai/open-source/blob/main/static/verl/running_job_1.png?raw=true
+.. image:: https://github.com/yottalabsai/open-source/blob/main/static/verl/finished.png?raw=true
+
+**Check the cluster on GCP**
+
+.. image:: https://github.com/yottalabsai/open-source/blob/main/static/verl/gcp_instances.png?raw=true
+
+**Check Ray Dashboard**
+
+We can see the cluster on the Ray Dashboard via the GCP head node:
+
+.. code-block:: console
+
+   $ sky status --endpoint 8265 verl
+   1.2.3.4:8265
+
+.. image:: https://github.com/yottalabsai/open-source/blob/main/static/verl/ray_dashboard_overview.png?raw=true
+.. image:: https://github.com/yottalabsai/open-source/blob/main/static/verl/ray_dashboard_jobs.png?raw=true
+.. image:: https://github.com/yottalabsai/open-source/blob/main/static/verl/ray_dashboard_cluster.png?raw=true
+
+
+**Check the checkpoint of model**
+
+.. code-block:: bash
+
+ # login the head node
+ ssh verl
+ # The global step will vary. Find the correct path from the training logs.
+ cd ~/sky_workdir/checkpoints/verl_examples/gsm8k/
+ # Then list contents to find the checkpoint, e.g.:
+ ls -R .
+
+.. image:: https://github.com/yottalabsai/open-source/blob/main/static/verl/saved_model.png?raw=true
+
+
+Option 3: Launch via Slurm
+------------------------------
+
+Ray provides users with `this `_ official
+tutorial to start a Ray cluster on top of Slurm. We have verified the :doc:`GSM8K example<../examples/gsm8k_example>`
+on a Slurm cluster under a multi-node setting with the following steps.
+
+1. [Optional] If your cluster supports `Apptainer or Singularity `_ and you wish
+to use it, convert verl's Docker image to an Apptainer image. Alternatively, set up the environment with the package
+manager available on your cluster or use other container runtimes (e.g. through `Slurm's OCI support `_) available to you.
+
+.. code:: bash
+
+ apptainer pull /your/dest/dir/vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3.sif docker://verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
+
+2. Follow :doc:`GSM8K example<../examples/gsm8k_example>` to prepare the dataset and model checkpoints.
+
+3. Modify `examples/slurm/ray_on_slurm.slurm `_ with your cluster's own information.
+
+4. Submit the job script to the Slurm cluster with `sbatch`.
+
+Please note that Slurm cluster setup may vary. If you encounter any issues, please refer to Ray's
+`Slurm user guide `_ for common caveats.
+
+If you changed Slurm resource specifications, please make sure to update the environment variables in the job script if necessary.
+
+
+Option 4: Launch via dstack
+------------------------------
+
+`dstackai/dstack `_ is an open-source container orchestrator that simplifies distributed training across cloud providers and on-premises environments
+without the need to use K8S or Slurm.
+
+Prerequisite
+~~~~~~~~~~~~
+Once dstack is `installed `_, initialize the directory as a repo with ``dstack init``.
+
+.. code-block:: bash
+
+ mkdir myproject && cd myproject
+ dstack init
+
+**Create a fleet**
+
+Before submitting distributed training jobs, create a `dstack` `fleet `_.
+
+Run a Ray cluster task
+~~~~~~~~~~~~~~~~~~~~~~
+
+Once the fleet is created, define a Ray cluster task, e.g. in ``ray-cluster.dstack.yml``:
+
+.. code-block:: yaml
+
+ type: task
+ name: ray-verl-cluster
+
+ nodes: 2
+
+ env:
+ - WANDB_API_KEY
+ - PYTHONUNBUFFERED=1
+ - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+ image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
+ commands:
+ - git clone https://github.com/volcengine/verl
+ - cd verl
+ - pip install --no-deps -e .
+ - pip install hf_transfer hf_xet
+ - |
+ if [ $DSTACK_NODE_RANK = 0 ]; then
+ python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k
+ python3 -c "import transformers; transformers.pipeline('text-generation', model='Qwen/Qwen2.5-7B-Instruct')"
+ ray start --head --port=6379;
+ else
+ ray start --address=$DSTACK_MASTER_NODE_IP:6379
+ fi
+
+ # Expose Ray dashboard port
+ ports:
+ - 8265
+
+ resources:
+ gpu: 80GB:8
+ shm_size: 128GB
+
+ # Save checkpoints on the instance
+ volumes:
+ - /checkpoints:/checkpoints
+
+Now, if you run this task via ``dstack apply``, it will automatically forward Ray's dashboard port to ``localhost:8265``.
+
+.. code-block:: bash
+
+ dstack apply -f ray-cluster.dstack.yml
+
+As long as ``dstack apply`` stays attached, you can use ``localhost:8265`` to submit Ray jobs for execution.
+
+Submit Ray jobs
+~~~~~~~~~~~~~~~
+
+Before you can submit Ray jobs, make sure ``ray`` is installed locally:
+
+.. code-block:: shell
+
+ pip install ray
+
+Now you can submit the training job to the Ray cluster which is available at ``localhost:8265``:
+
+.. code-block:: shell
+
+    $ export RAY_ADDRESS=http://localhost:8265
+ $ ray job submit \
+ -- python3 -m verl.trainer.main_ppo \
+ data.train_files=/root/data/gsm8k/train.parquet \
+ data.val_files=/root/data/gsm8k/test.parquet \
+ data.train_batch_size=256 \
+ data.max_prompt_length=512 \
+ data.max_response_length=256 \
+ actor_rollout_ref.model.path=Qwen/Qwen2.5-7B-Instruct \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=64 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+ critic.optim.lr=1e-5 \
+ critic.model.path=Qwen/Qwen2.5-7B-Instruct \
+ critic.ppo_micro_batch_size_per_gpu=4 \
+ algorithm.kl_ctrl.kl_coef=0.001 \
+ trainer.project_name=ppo_training \
+ trainer.experiment_name=qwen-2.5-7B \
+ trainer.val_before_train=False \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=2 \
+ trainer.default_local_dir=/checkpoints \
+ trainer.save_freq=10 \
+ trainer.test_freq=10 \
+    trainer.total_epochs=15 \
+    trainer.resume_mode=disable 2>&1 | tee verl_demo.log
+
+
+For more details on how `dstack` works, check out its `documentation `_.
+
+How to debug?
+---------------------
+
+
+Ray Distributed Debugger VSCode Extension (Recommended)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+1. Starting with Ray 2.39, Anyscale has introduced the `Ray Distributed Debugger `_ VSCode extension. Follow the extension’s installation instructions, then add your cluster using the dashboard URL you obtained earlier.
+
+ .. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/debugger.png?raw=true
+ :alt: Ray Distributed Debugger VSCode extension screenshot
+
+2. Prerequisites.
+
+ Ensure the following are installed (see the extension README for more detail):
+
+ - Visual Studio Code
+ - `ray[default]` >= 2.9.1
+ - `debugpy` >= 1.8.0
+
+ .. image:: https://github.com/aoshen524/verl/blob/main/docs/start/c7098b755ff689859837773a916c857.png?raw=true
+ :alt: VSCode with Ray prerequisites
+
+3. Environment Variables.
+
+ To enable post‑mortem debugging, set:
+
+ .. code-block:: bash
+
+ export RAY_DEBUG_POST_MORTEM=1
+
+ .. admonition:: Note
+ :class: important
+
+ Be sure to remove any legacy flags before starting Ray:
+
+ - `RAY_DEBUG=legacy`
+ - `--ray-debugger-external`
+
+4. Configuring Breakpoints.
+
+
+ 1. Insert `breakpoint()` calls into your remote functions.
+ 2. Submit your job to the cluster.
+
+ The extension will detect active breakpoints and display them in VSCode.
+
+ .. image:: https://github.com/aoshen524/verl/blob/main/docs/start/4ddad74395c79a1402331c0ce73316f.png?raw=true
+ :alt: Detected breakpoint in VSCode
+
+ **Note:** Breakpoints are only supported inside functions decorated with `@ray.remote`.
+
+5. Launching the Debugger.
+
+ Run your job directly from the command line (do not use a `launch.json`):
+
+ .. code-block:: bash
+
+ python job.py
+
+6. Attaching to a Breakpoint.
+
+ Once the process hits the first `breakpoint()`, click the Ray Distributed Debugger icon in the VSCode sidebar to attach the debugger.
+
+ .. image:: https://github.com/aoshen524/verl/blob/main/docs/start/4ddad74395c79a1402331c0ce73316f.png?raw=true
+ :alt: Attaching VSCode debugger to Ray process
+
+7. Debugging With Multiple breakpoint().
+
+ For each subsequent task, first disconnect the current debugger session, then click the extension icon again to attach to the next breakpoint.
+
+ .. image:: https://github.com/aoshen524/verl/blob/main/docs/start/6e83c910a62c82fecb89c6619e001cd.png?raw=true
+ :alt: Disconnecting and reconnecting the debugger
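+
+For reference, breakpoints must live inside Ray tasks or actors; a minimal
+job the extension can attach to looks like this (an illustrative sketch,
+not a verl script):
+
+.. code-block:: python
+
+    import ray
+
+    ray.init()
+
+    @ray.remote
+    def square(x):
+        breakpoint()  # execution pauses here until the debugger attaches
+        return x * x
+
+    print(ray.get(square.remote(4)))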
+
+Legacy Ray Debugger
+~~~~~~~~~~~~~~~~~~~
+1. Ray has a built-in legacy `debugger `_ that allows you to debug your distributed applications. To enable the debugger, start the ray cluster with ``RAY_DEBUG=legacy`` and ``--ray-debugger-external``.
+
+.. code-block:: bash
+
+ # start head node
+ RAY_DEBUG=legacy ray start --head --dashboard-host=0.0.0.0 --ray-debugger-external
+ # start worker node
+ RAY_DEBUG=legacy ray start --address='10.124.46.192:6379' --ray-debugger-external
+
+2. Set up a breakpoint in your code, and submit the job to the cluster. Then run ``ray debug`` to wait for the breakpoint:
+
+.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/legacy.png?raw=true
+
+
+Multi-node training on AMD clusters
+---------------------------------------------------------------------------------------
+
+If you want to run multi-node training with Slurm using a Docker/Podman container on an AMD cluster, you can use the following script.
+
+If you encounter any issues in using AMD GPUs running verl, please contact `Yusheng Su