shahidul034 commited on
Commit
034cb04
·
verified ·
1 Parent(s): e00ff48

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/fsdp_config.json +4 -0
  2. code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/huggingface/generation_config.json +13 -0
  3. code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/huggingface/tokenizer_config.json +239 -0
  4. code/RL_model/models/RL_model_only_subclaim_test/latest_checkpointed_iteration.txt +1 -0
  5. code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/fsdp_config.json +4 -0
  6. code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/huggingface/added_tokens.json +28 -0
  7. code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/huggingface/chat_template.jinja +61 -0
  8. code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/huggingface/config.json +68 -0
  9. code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/huggingface/generation_config.json +13 -0
  10. code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/huggingface/merges.txt +0 -0
  11. code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/huggingface/special_tokens_map.json +31 -0
  12. code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/huggingface/tokenizer_config.json +239 -0
  13. code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/huggingface/vocab.json +0 -0
  14. code/RL_model/models/RL_model_subclaim_classifier_v1/latest_checkpointed_iteration.txt +1 -0
  15. code/RL_model/verl/Search-R1/misc/docs/experiment_log.md +47 -0
  16. code/RL_model/verl/Search-R1/misc/docs/multinode.md +134 -0
  17. code/RL_model/verl/Search-R1/misc/docs/retriever.md +128 -0
  18. code/RL_model/verl/Search-R1/misc/example/case.txt +43 -0
  19. code/RL_model/verl/Search-R1/misc/example/corpus.jsonl +10 -0
  20. code/RL_model/verl/Search-R1/misc/example/multinode/train_grpo_multinode_32b.sh +77 -0
  21. code/RL_model/verl/Search-R1/misc/example/multinode/train_grpo_multinode_72b.sh +75 -0
  22. code/RL_model/verl/Search-R1/misc/example/multinode/train_ppo_multinode_32b.sh +84 -0
  23. code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_ann.sh +12 -0
  24. code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_bm25.sh +10 -0
  25. code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_google.sh +8 -0
  26. code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_hierarchical.sh +17 -0
  27. code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_serpapi.sh +7 -0
  28. code/RL_model/verl/Search-R1/misc/public/status.png +0 -0
  29. code/RL_model/verl/Search-R1/misc/public/worker.png +0 -0
  30. code/RL_model/verl/Search-R1/misc/scripts/data_process/nq.py +100 -0
  31. code/RL_model/verl/Search-R1/misc/scripts/data_process/nq_rag.py +141 -0
  32. code/RL_model/verl/Search-R1/misc/scripts/data_process/nq_search.py +101 -0
  33. code/RL_model/verl/Search-R1/misc/scripts/data_process/qa_search_test_merge.py +115 -0
  34. code/RL_model/verl/Search-R1/misc/scripts/data_process/qa_search_train_merge.py +105 -0
  35. code/RL_model/verl/Search-R1/misc/scripts/download.py +25 -0
  36. code/RL_model/verl/Search-R1/misc/scripts/download.sh +6 -0
  37. code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/README.md +42 -0
  38. code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/data_process.sh +10 -0
  39. code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/evaluate.sh +65 -0
  40. code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.1/train_grpo.sh +84 -0
  41. code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.1/train_ppo.sh +92 -0
  42. code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.2/train_grpo.sh +79 -0
  43. code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.2/train_ppo.sh +88 -0
  44. code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.3/train_grpo_format.sh +87 -0
  45. code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.3/train_ppo_format.sh +94 -0
  46. code/RL_model/verl/Search-R1/misc/scripts/upload.py +12 -0
  47. code/RL_model/verl/Search-R1/misc/scripts/upload.sh +6 -0
  48. code/RL_model/verl/Search-R1/verl/models/README.md +35 -0
  49. code/RL_model/verl/Search-R1/verl/models/__init__.py +13 -0
  50. code/RL_model/verl/Search-R1/verl/models/llama/__init__.py +13 -0
code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/fsdp_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "FSDP_version": 1,
3
+ "world_size": 2
4
+ }
code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/huggingface/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.7,
10
+ "top_k": 20,
11
+ "top_p": 0.8,
12
+ "transformers_version": "4.56.1"
13
+ }
code/RL_model/models/RL_model_only_subclaim_test/global_step_60/actor/huggingface/tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 1010000,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
code/RL_model/models/RL_model_only_subclaim_test/latest_checkpointed_iteration.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 60
code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/fsdp_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "FSDP_version": 1,
3
+ "world_size": 2
4
+ }
code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/huggingface/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/huggingface/chat_template.jinja ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- for message in messages %}
18
+ {%- if message.content is string %}
19
+ {%- set content = message.content %}
20
+ {%- else %}
21
+ {%- set content = '' %}
22
+ {%- endif %}
23
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
24
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
25
+ {%- elif message.role == "assistant" %}
26
+ {{- '<|im_start|>' + message.role + '\n' + content }}
27
+ {%- if message.tool_calls %}
28
+ {%- for tool_call in message.tool_calls %}
29
+ {%- if (loop.first and content) or (not loop.first) %}
30
+ {{- '\n' }}
31
+ {%- endif %}
32
+ {%- if tool_call.function %}
33
+ {%- set tool_call = tool_call.function %}
34
+ {%- endif %}
35
+ {{- '<tool_call>\n{"name": "' }}
36
+ {{- tool_call.name }}
37
+ {{- '", "arguments": ' }}
38
+ {%- if tool_call.arguments is string %}
39
+ {{- tool_call.arguments }}
40
+ {%- else %}
41
+ {{- tool_call.arguments | tojson }}
42
+ {%- endif %}
43
+ {{- '}\n</tool_call>' }}
44
+ {%- endfor %}
45
+ {%- endif %}
46
+ {{- '<|im_end|>\n' }}
47
+ {%- elif message.role == "tool" %}
48
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
49
+ {{- '<|im_start|>user' }}
50
+ {%- endif %}
51
+ {{- '\n<tool_response>\n' }}
52
+ {{- content }}
53
+ {{- '\n</tool_response>' }}
54
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
55
+ {{- '<|im_end|>\n' }}
56
+ {%- endif %}
57
+ {%- endif %}
58
+ {%- endfor %}
59
+ {%- if add_generation_prompt %}
60
+ {{- '<|im_start|>assistant\n' }}
61
+ {%- endif %}
code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/huggingface/config.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "dtype": "float32",
8
+ "eos_token_id": 151645,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2560,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 9728,
14
+ "layer_types": [
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention"
51
+ ],
52
+ "max_position_embeddings": 262144,
53
+ "max_window_layers": 36,
54
+ "model_type": "qwen3",
55
+ "num_attention_heads": 32,
56
+ "num_hidden_layers": 36,
57
+ "num_key_value_heads": 8,
58
+ "pad_token_id": 151643,
59
+ "rms_norm_eps": 1e-06,
60
+ "rope_scaling": null,
61
+ "rope_theta": 5000000,
62
+ "sliding_window": null,
63
+ "tie_word_embeddings": true,
64
+ "transformers_version": "4.56.1",
65
+ "use_cache": true,
66
+ "use_sliding_window": false,
67
+ "vocab_size": 151936
68
+ }
code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/huggingface/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.7,
10
+ "top_k": 20,
11
+ "top_p": 0.8,
12
+ "transformers_version": "4.56.1"
13
+ }
code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/huggingface/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/huggingface/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/huggingface/tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 1010000,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
code/RL_model/models/RL_model_subclaim_classifier_v1/global_step_45/actor/huggingface/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
code/RL_model/models/RL_model_subclaim_classifier_v1/latest_checkpointed_iteration.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 45
code/RL_model/verl/Search-R1/misc/docs/experiment_log.md ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ## Experiment log
3
+
4
+ ### Preliminary results
5
+
6
+ Resources: [wandb](https://wandb.ai/peterjin/Search-R1-open)
7
+
8
+
9
+ The preliminary experiment is conducted only on natural question (NQ) dataset (+ PPO) with a small number of training steps.
10
+
11
+
12
+ ### v0.1
13
+
14
+ Resources: [wandb](https://wandb.ai/peterjin/Search-R1-nq_hotpotqa_train), [docs](https://github.com/PeterGriffinJin/Search-R1/tree/main/scripts/nq_hotpotqa), [scripts](https://github.com/PeterGriffinJin/Search-R1/tree/main/scripts/nq_hotpotqa/v0.1)
15
+
16
+
17
+ We extend the experiments from NQ to seven datasets with both PPO and GRPO methods. The studies are still on a small number of training steps with a big learning rate warm up ratio.
18
+
19
+
20
+ ### v0.2
21
+
22
+ Resources: [wandb](https://wandb.ai/peterjin/Search-R1-v0.2), [docs](https://github.com/PeterGriffinJin/Search-R1/tree/main/scripts/nq_hotpotqa), [scripts](https://github.com/PeterGriffinJin/Search-R1/tree/main/scripts/nq_hotpotqa/v0.2), [paper](https://arxiv.org/abs/2503.09516)
23
+
24
+
25
+ We fix several bugs including [retrieved token masking](https://github.com/PeterGriffinJin/Search-R1/pull/21) and [GRPO sample indexing](https://github.com/PeterGriffinJin/Search-R1/commit/9ec2fa9892fbf0315d0c67b4dc08ae8f6cf5f378).
26
+ The former can largely improve the stablity of RL training.
27
+ Then we adjust the training scripts, increasing the number of training steps and decreasing the learning rate warm up ratio, to obtain a better performance, and conduct experiments on different scale of LLMs (3B, 7B, 14B).
28
+
29
+
30
+ ### v0.3
31
+
32
+ Resources: [wandb](https://wandb.ai/peterjin/Search-R1-v0.3), [docs](https://github.com/PeterGriffinJin/Search-R1/tree/main/scripts/nq_hotpotqa), [scripts](https://github.com/PeterGriffinJin/Search-R1/tree/main/scripts/nq_hotpotqa/v0.3), [paper](https://arxiv.org/abs/2505.15117)
33
+
34
+ We conduct studies on (1) reward design; (2) LLM backbone; and (3) search engine.
35
+
36
+ - Reward design
37
+ - Format reward
38
+ - Intermediate retrieval reward
39
+ - LLM backbone
40
+ - LLM type (e.g., general LLM or reasoning LLM)
41
+ - LLM scale (3B/7B/14B/32B)
42
+ - Search engine
43
+ - RL training dynamics
44
+ - generalization during inference
45
+ - Data scaling
46
+
47
+ Details can be found in the [paper](https://arxiv.org/abs/2505.15117).
code/RL_model/verl/Search-R1/misc/docs/multinode.md ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ## Multinode Training
3
+
4
+ Our codebase supports multi-node training for large-scale language models. The implementation is mainly based on [Ray](https://github.com/ray-project/ray).
5
+
6
+ There are two types of nodes when doing Ray multi-node training: (1) head node and (2) worker nodes.
7
+ There is only one head node where you will start the ray cluster and submit the job.
8
+ The other nodes are worker nodes, where you only need to start and register to the ray cluster.
9
+
10
+ ### Step 1: Set up multinode ray cluster (from [link](https://verl.readthedocs.io/en/latest/start/multinode.html#set-up-multinode-ray-cluster))
11
+
12
+ a. Start **head** node with ```ray start --head --dashboard-host=0.0.0.0```, there’re 2 address you should care about:
13
+
14
+ - GCS address: ```ray start --address=<address>```, where **worker** node should connect to.
15
+
16
+ - Dashboard address: ```<address>:8265```, where you should submit job to the cluster.
17
+
18
+ ![head](../public/head.png)
19
+
20
+ b. Start **worker node** and register it to the ray cluster with ```ray start --address=<address>``` you get above.
21
+
22
+ ![worker](../public/worker.png)
23
+
24
+ c. Check the cluster status with ```ray status```.
25
+
26
+ For example, if you have two nodes (each with 8 GPUs) in the cluster, you should see something like this:
27
+
28
+ ![status](../public/status.png)
29
+
30
+
31
+ ### Step 2: Launch the retrieval server on every node.
32
+
33
+ We would recommend launch the **same** retrieval server on every nodes (including both head and worker nodes) for the stable RL training. Detailed information on how to launch different retrievers can be found as follows: [doc](https://github.com/PeterGriffinJin/Search-R1/blob/main/docs/retriever.md) and [scripts](https://github.com/PeterGriffinJin/Search-R1/tree/main/example/retriever).
34
+
35
+ For example, if you want to launch the local dense retriever with flat indexing, run the following command on **every** nodes:
36
+
37
+ ```
38
+ bash retrieval_launch.sh
39
+ ```
40
+
41
+
42
+ ### Step 3: Start the job
43
+
44
+ After the retrievers are launched, you can start the training job. You only need to start the job on the ***head*** node.
45
+
46
+ An example script is shown as below. Change ```RAY_DASHBOARD_ADDRESS``` and ```N_NODES``` to your dashboard address found in step 1 and the number of nodes respectively.
47
+
48
+ More script examples can be found [here](https://github.com/PeterGriffinJin/Search-R1/tree/main/example/multinode).
49
+
50
+
51
+ ```bash
52
+ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
53
+ export DATA_DIR='data/nq_search'
54
+
55
+ WAND_PROJECT="Search-R1-release"
56
+ RAY_DASHBOARD_ADDRESS="<address>:8265"
57
+ N_NODES=2
58
+
59
+ export BASE_MODEL='Qwen/Qwen2.5-7B'
60
+ export EXPERIMENT_NAME=${train_data}-${test_data}-search-r1-ppo-qwen2.5-7b-em-multinode-$N_NODES
61
+
62
+ # set -x
63
+ export VLLM_ATTENTION_BACKEND=XFORMERS
64
+
65
+ ulimit -n 65535
66
+
67
+ ray job submit --address=$RAY_DASHBOARD_ADDRESS \
68
+ --runtime-env=verl/trainer/runtime_env.yaml \
69
+ --no-wait \
70
+ -- \
71
+ python3 -m verl.trainer.main_ppo \
72
+ data.train_files=$DATA_DIR/train.parquet \
73
+ data.val_files=$DATA_DIR/test.parquet \
74
+ data.train_data_num=null \
75
+ data.val_data_num=null \
76
+ data.train_batch_size=512 \
77
+ data.val_batch_size=256 \
78
+ data.max_prompt_length=4096 \
79
+ data.max_response_length=500 \
80
+ data.max_start_length=2048 \
81
+ data.max_obs_length=500 \
82
+ data.shuffle_train_dataloader=True \
83
+ algorithm.adv_estimator=gae \
84
+ actor_rollout_ref.model.path=$BASE_MODEL \
85
+ actor_rollout_ref.actor.optim.lr=1e-6 \
86
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
87
+ actor_rollout_ref.model.use_remove_padding=True \
88
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
89
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
90
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
91
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
92
+ actor_rollout_ref.actor.fsdp_config.grad_offload=False \
93
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
94
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
95
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
96
+ actor_rollout_ref.rollout.name=vllm \
97
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
98
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
99
+ actor_rollout_ref.ref.fsdp_config.param_offload=False \
100
+ actor_rollout_ref.rollout.n_agent=1 \
101
+ actor_rollout_ref.rollout.temperature=1 \
102
+ actor_rollout_ref.rollout.top_p=1.0 \
103
+ actor_rollout_ref.actor.state_masking=true \
104
+ critic.optim.lr=1e-5 \
105
+ critic.model.use_remove_padding=True \
106
+ critic.optim.lr_warmup_steps_ratio=0.015 \
107
+ critic.model.path=$BASE_MODEL \
108
+ critic.model.enable_gradient_checkpointing=true \
109
+ critic.ppo_micro_batch_size=16 \
110
+ critic.model.fsdp_config.param_offload=False \
111
+ critic.model.fsdp_config.grad_offload=False \
112
+ critic.model.fsdp_config.optimizer_offload=False \
113
+ algorithm.kl_ctrl.kl_coef=0.001 \
114
+ algorithm.no_think_rl=false \
115
+ trainer.critic_warmup=0 \
116
+ trainer.logger=['wandb'] \
117
+ +trainer.val_only=false \
118
+ +trainer.val_before_train=false \
119
+ trainer.default_hdfs_dir=null \
120
+ trainer.n_gpus_per_node=8 \
121
+ trainer.nnodes=$N_NODES \
122
+ trainer.save_freq=100 \
123
+ trainer.test_freq=100 \
124
+ trainer.project_name=$WAND_PROJECT \
125
+ trainer.experiment_name=$EXPERIMENT_NAME \
126
+ trainer.total_epochs=15 \
127
+ trainer.total_training_steps=1005 \
128
+ trainer.default_hdfs_dir=null \
129
+ trainer.default_local_dir=verl_checkpoints/$EXPERIMENT_NAME \
130
+ max_turns=4 \
131
+ retriever.url="http://127.0.0.1:8000/retrieve" \
132
+ retriever.topk=3 \
133
+ 2>&1 | tee $EXPERIMENT_NAME.log
134
+ ```
code/RL_model/verl/Search-R1/misc/docs/retriever.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ## Search Engine
3
+
4
+ In this document, we provide examples of how to launch different retrievers, including local sparse retriever (e.g., BM25), local dense retriever (e.g., e5) and online search engine.
5
+ For local retrievers, we use [wiki-18](https://huggingface.co/datasets/PeterJinGo/wiki-18-corpus) corpus as an example and the corpus indexing can be found at [bm25](https://huggingface.co/datasets/PeterJinGo/wiki-18-bm25-index), [e5-flat](https://huggingface.co/datasets/PeterJinGo/wiki-18-e5-index), [e5-HNSW64](https://huggingface.co/datasets/PeterJinGo/wiki-18-e5-index-HNSW64).
6
+
7
+ ### How to choose the retriever?
8
+
9
+ - If you have a private or domain-specific corpus, choose **local retriever**.
10
+
11
+ - If there are no high-quality embedding-based retrievers (dense retrievers) in your domain, choose **sparse local retriever** (e.g., BM25).
12
+
13
+ - Otherwise choose **dense local retriever**.
14
+
15
+ - If you do not have sufficient GPUs to conduct exact dense embedding matching, choose **ANN indexing** on CPUs.
16
+
17
+ - If you have sufficient GPUs, choose **flat indexing** on GPUs.
18
+
19
+
20
+ - If you want to train a general LLM search agent and have enough funding, choose **online search engine** (e.g., [SerpAPI](https://serpapi.com/)).
21
+
22
+
23
+ - If you have a domain specific online search engine (e.g., PubMed search), you can refer to [link](https://github.com/PeterGriffinJin/Search-R1/blob/main/search_r1/search/serp_search_server.py) to integrate it to Search-R1 by yourself.
24
+
25
+ Search engine launching scripts can be found at [link](https://github.com/PeterGriffinJin/Search-R1/tree/main/example/retriever).
26
+
27
+ ### Local Sparse Retriever
28
+
29
+ Sparse retriever (e.g., bm25) is a traditional method. The retrieval process is very efficient and no GPUs are needed. However, it may not be as accurate as dense retrievers in some specific domains.
30
+
31
+ (1) Download the indexing.
32
+ ```bash
33
+ save_path=/your/path/to/save
34
+ huggingface-cli download PeterJinGo/wiki-18-bm25-index --repo-type dataset --local-dir $save_path
35
+ ```
36
+
37
+ (2) Launch a local BM25 retriever server.
38
+ ```bash
39
+ conda activate retriever
40
+
41
+ index_file=$save_path/bm25
42
+ corpus_file=$save_path/wiki-18.jsonl
43
+ retriever_name=bm25
44
+
45
+ python search_r1/search/retrieval_server.py --index_path $index_file --corpus_path $corpus_file --topk 3 --retriever_name $retriever_name
46
+ ```
47
+
48
+
49
+ ### Local Dense Retriever
50
+
51
+ You can also adopt some off-the-shelf dense retrievers, e.g., e5. These models are much stronger than sparse retrievers in some specific domains.
52
+ If you have sufficient GPU, we would recommend the flat indexing variant below, otherwise you can adopt the ANN variant.
53
+
54
+ #### Flat indexing
55
+
56
+ Flat indexing conducts exact embedding match, which is slow but very accurate. To make it efficient enough to support online RL, we would recommend enabling **GPU** usage by ```--faiss_gpu```.
57
+
58
+ (1) Download the indexing and corpus.
59
+ ```bash
60
+ save_path=/the/path/to/save
61
+ python scripts/download.py --save_path $save_path
62
+ cat $save_path/part_* > $save_path/e5_Flat.index
63
+ gzip -d $save_path/wiki-18.jsonl.gz
64
+ ```
65
+
66
+ (2) Launch a local flat e5 retriever server.
67
+
68
+ ```bash
69
+ conda activate retriever
70
+
71
+ index_file=$save_path/e5_Flat.index
72
+ corpus_file=$save_path/wiki-18.jsonl
73
+ retriever_name=e5
74
+ retriever_path=intfloat/e5-base-v2
75
+
76
+ python search_r1/search/retrieval_server.py --index_path $index_file --corpus_path $corpus_file --topk 3 --retriever_name $retriever_name --retriever_model $retriever_path --faiss_gpu
77
+
78
+ ```
79
+
80
+
81
+ #### ANN indexing (HNSW64)
82
+
83
+ To improve search efficiency with only **CPU**, you can adopt approximate nearest neighbor (ANN) indexing, e.g., with HNSW64.
84
+ It is very efficient, but may not be as accurate as flat indexing, especially when the number of retrieved passages is small.
85
+
86
+ (1) Download the indexing.
87
+ ```bash
88
+ save_path=/the/path/to/save
89
+ huggingface-cli download PeterJinGo/wiki-18-e5-index-HNSW64 --repo-type dataset --local-dir $save_path
90
+ cat $save_path/part_* > $save_path/e5_HNSW64.index
91
+ ```
92
+
93
+
94
+ (2) Launch a local ANN dense retriever server.
95
+ ```bash
96
+ conda activate retriever
97
+
98
+ index_file=$save_path/e5_HNSW64.index
99
+ corpus_file=$save_path/wiki-18.jsonl
100
+ retriever_name=e5
101
+ retriever_path=intfloat/e5-base-v2
102
+
103
+ python search_r1/search/retrieval_server.py --index_path $index_file --corpus_path $corpus_file --topk 3 --retriever_name $retriever_name --retriever_model $retriever_path
104
+ ```
105
+
106
+
107
+ ### Online Search Engine
108
+
109
+ We support both [Google Search API](https://developers.google.com/custom-search/v1/overview) and [SerpAPI](https://serpapi.com/). We would recommend [SerpAPI](https://serpapi.com/) since it integrates multiple online search engine APIs (including Google, Bing, Baidu, etc) and does not have a monthly quota limitation ([Google Search API](https://developers.google.com/custom-search/v1/overview) has a hard 10k monthly quota, which is not sufficient to fulfill online LLM RL training).
110
+
111
+ #### SerpAPI online search server
112
+
113
+ ```bash
114
+ search_url=https://serpapi.com/search
115
+ serp_api_key="" # put your serp api key here (https://serpapi.com/)
116
+
117
+ python search_r1/search/serp_search_server.py --search_url $search_url --topk 3 --serp_api_key $serp_api_key
118
+ ```
119
+
120
+ #### Google online search server
121
+
122
+ ```bash
123
+ api_key="" # put your google custom API key here (https://developers.google.com/custom-search/v1/overview)
124
+ cse_id="" # put your google cse API key here (https://developers.google.com/custom-search/v1/overview)
125
+
126
+ python search_r1/search/google_search_server.py --api_key $api_key --topk 5 --cse_id $cse_id --snippet_only
127
+ ```
128
+
code/RL_model/verl/Search-R1/misc/example/case.txt ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ <|im_start|>user
4
+ Answer the given question. You must conduct reasoning inside <think> and </think> first every time you get new information. After reasoning, if you find you lack some knowledge, you can call a search engine by <search> query </search> and it will return the top searched results between <information> and </information>. You can search as many times as your want. If you find no further external knowledge needed, you can directly provide the answer inside <answer> and </answer>, without detailed illustrations. For example, <answer> Beijing </answer>. Question: paul walkers cars in fast and furious list?
5
+ <|im_end|>
6
+ <|im_start|>assistant
7
+ <think>I need to search for Paul Walker's cars in Fast and Furious movies.</think>
8
+ <search> Paul Walker's cars in Fast and Furious </search>
9
+
10
+ <information>Doc 1(Title: "Paul Walker") also had a large car collection of about 30 cars, a portion of which he co-owned along with Roger Rodas. The collection included (but is not limited to): Roger Rodas became Walker's financial adviser in 2007 and helped to establish Reach Out Worldwide. Rodas, a pro-am racer was the CEO of Always Evolving, a Valencia high-end vehicle performance shop owned by Walker. Walker was also a close friend of his ""2 Fast 2 Furious"" co-star Tyrese Gibson. Vin Diesel considered Walker to be like a brother, both on and off screen, and affectionately called him ""Pablo"". Walker's mother referred to
11
+ Doc 2(Title: "Paul Walker") Paul Walker Paul William Walker IV (September 12, 1973 – November 30, 2013) was an American actor best known for his role as Brian O'Conner in ""The Fast and the Furious"" franchise. Walker first gained prominence in 1999 with roles in the teen films ""She's All That"" and ""Varsity Blues"". In 2001, he gained international fame for his role in the street racing action film ""The Fast and the Furious"" (2001), a role he reprised in five of the next six installments, but died in 2013 in the middle of filming ""Furious 7"" (2015). Walker began his career guest-starring on
12
+ Doc 3(Title: "Paul Walker") of Porsche in a separate lawsuit filed by Roger Rodas' widow, Kristine. The ruling had no bearing on two other cases against Porsche which have been filed by Walker's father, who is also the executor of the actor's estate, and his daughter. Walker's father and daughter both reached an agreement with Porsche. Paul Walker Paul William Walker IV (September 12, 1973 – November 30, 2013) was an American actor best known for his role as Brian O'Conner in ""The Fast and the Furious"" franchise. Walker first gained prominence in 1999 with roles in the teen films ""She's All That"" and</information>
13
+
14
+ From the information provided, it's clear that Paul Walker was a part of the "Fast and Furious" series, but the specific list of cars is not mentioned. Since I lack this particular detail, I will call a search engine to get the specific list of cars Paul Walker drove in the "Fast and Furious" movies.
15
+
16
+ <search> list of cars paul walker drove in Fast and Furious </search>
17
+
18
+ <information>Doc 1(Title: "Paul Walker") also had a large car collection of about 30 cars, a portion of which he co-owned along with Roger Rodas. The collection included (but is not limited to): Roger Rodas became Walker's financial adviser in 2007 and helped to establish Reach Out Worldwide. Rodas, a pro-am racer was the CEO of Always Evolving, a Valencia high-end vehicle performance shop owned by Walker. Walker was also a close friend of his ""2 Fast 2 Furious"" co-star Tyrese Gibson. Vin Diesel considered Walker to be like a brother, both on and off screen, and affectionately called him ""Pablo"". Walker's mother referred to
19
+ Doc 2(Title: "Paul Walker") Paul Walker Paul William Walker IV (September 12, 1973 – November 30, 2013) was an American actor best known for his role as Brian O'Conner in ""The Fast and the Furious"" franchise. Walker first gained prominence in 1999 with roles in the teen films ""She's All That"" and ""Varsity Blues"". In 2001, he gained international fame for his role in the street racing action film ""The Fast and the Furious"" (2001), a role he reprised in five of the next six installments, but died in 2013 in the middle of filming ""Furious 7"" (2015). Walker began his career guest-starring on
20
+ Doc 3(Title: "Paul Walker") of Porsche in a separate lawsuit filed by Roger Rodas' widow, Kristine. The ruling had no bearing on two other cases against Porsche which have been filed by Walker's father, who is also the executor of the actor's estate, and his daughter. Walker's father and daughter both reached an agreement with Porsche. Paul Walker Paul William Walker IV (September 12, 1973 – November 30, 2013) was an American actor best known for his role as Brian O'Conner in ""The Fast and the Furious"" franchise. Walker first gained prominence in 1999 with roles in the teen films ""She's All That"" and</information>
21
+
22
+ Based on the updated information, it seems the focus was on his career, financials, and family. However, I am still missing the specific list of cars he drove in the "Fast and Furious" movies. Since it appears that the information might not be contained within the accessed documents, and I have no further search queries to make, I will provide an answer based on the details I have.
23
+
24
+ <answer> Charger </answer>
25
+
26
+
27
+
28
+ <|im_start|>system
29
+ You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
30
+ <|im_start|>user
31
+ Answer the given question. You must conduct reasoning inside <think> and </think> first every time you get new information. After reasoning, if you find you lack some knowledge, you can call a search engine by <search> query </search> and it will return the top searched results between <information> and </information>. You can search as many times as your want. If you find no further external knowledge needed, you can directly provide the answer inside <answer> and </answer>, without detailed illustrations. For example, <answer> Beijing </answer>. Question: the heart muscle is stimulated to contract by electrical impulses which are generated where?
32
+ <|im_end|>
33
+ <|im_start|>assistant
34
+ <think> I remember that the heart's electrical impulses originate from a specific part of the heart, but I need to confirm which part.</think>
35
+ <search> origin of heart's electrical impulses </search>
36
+
37
+ <information>Doc 1(Title: "Cardiac pacemaker") Cardiac pacemaker The contraction of cardiac muscle (heart muscle) in all animals is initiated by electrical impulses known as action potentials. The rate at which these impulses fire controls the rate of cardiac contraction, that is, the heart rate. The cells that create these rhythmic impulses, setting the pace for blood pumping, are called pacemaker cells, and they directly control the heart rate. They make up the cardiac pacemaker, that is, the natural pacemaker of the heart. In most humans, the concentration of pacemaker cells in the sinoatrial (SA) node is the natural pacemaker, and the resultant rhythm is a
38
+ Doc 2(Title: "Electrical conduction system of the heart") Electrical conduction system of the heart The electrical conduction system of the heart transmits signals generated usually by the sinoatrial node to cause contraction of the heart muscle. The pacemaking signal generated in the sinoatrial node travels through the right atrium to the atrioventricular node, along the Bundle of His and through bundle branches to cause contraction of the heart muscle. This signal stimulates contraction first of the right and left atrium, and then the right and left ventricles. This process allows blood to be pumped throughout the body. The conduction system consists of specialised heart muscle cells, and is
39
+ Doc 3(Title: "Sinoatrial node") Sinoatrial node The sinoatrial node (SA node), also known as sinus node, is a group of cells located in the wall of the right atrium of the heart. These cells have the ability to spontaneously produce an electrical impulse (action potential; see below for more details), that travels through the heart via the electrical conduction system (see figure 1) causing it to contract. In a healthy heart, the SA node continuously produces action potential, setting the rhythm of the heart and so is known as the heart's natural pacemaker. The rate of action potential production (and therefore the heart rate)</information>
40
+
41
+ From the information provided, it is clear that the heart's electrical impulses are generated in the sinoatrial (SA) node, which is often referred to as the heart's natural pacemaker. </think>
42
+
43
+ <answer> sinoatrial (SA) node </answer>
code/RL_model/verl/Search-R1/misc/example/corpus.jsonl ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {"id": "0", "contents": "\"Evan Morris\"\nEvan Morris Evan L. Morris (January 26, 1977 \u2013 July 9, 2015) was a lobbyist for Genentech and its parent corporation Roche in Washington, D.C. He began his career in Washington as an intern in the Clinton White House at age 18. He began his lobbying work at Patton Boggs before moving on to Roche in 2005. His early work at Roche involved government policy on Medicare and Medicaid, flu preparedness, and hepatitis C. His work at Genentech is being investigated by the Wall Street Journal, which states that \u201cshaping up to be one of the biggest U.S. investigations into"}
2
+ {"id": "1", "contents": "\"Horatio Hale\"\nconsisted of an Algonkin vocabulary, which he gathered from a band of Indians who had camped on the college grounds. Three years later, when the United States Exploring Expedition to little-known portions of the globe was organised under Charles Wilkes, Hale was recommended, while yet an undergraduate, for the post of ethnologist and philologist, and obtained the appointment. From 1838 to 1842, he was employed in the work of the expedition, visiting South America, Australasia, Polynesia, and North-western America, then known as Oregon. From this point he returned overland. The Hale Passages of Puget Sound were named in recognition of"}
3
+ {"id": "2", "contents": "\"Dibba Al-Hisn\"\nis believed to be the site where the Portuguese during the Habsburg Dynasty built a fort and a wall around the city. In August 1648, the Arabs besieged Muscat, Oman, and on October 31, 1648, a treaty was signed between the two opponents. The terms required the Portuguese to build the fortress of Kuriyat, Dibba Al-Hisn and Matrah (Oman). Note: There have been land disputes between Dibba Al-Hisn and Dibba Al-Baya, but these were resolved in the 1990s. </div> A branch of the Community College of the University of Sharjah is available in Dibba Al-Hisn. Alkhalidya Secondary School Dibba Al-Hisn"}
4
+ {"id": "3", "contents": "\"Ao Oni (film)\"\nthe door as a big blue hand grabs his head. The group hears Takeshi's scream and come back to check on him. As they reach the room, they see the door open and Takeshi, bleeding, is thrown out. He asks for help before being dragged back and the door closes. As the group enters the room, they finds Takeshi's dismembered body. Anna and Hiroshi pressure Mika about Takuro's whereabouts and about Naoki, but Mika refuses to tell them and runs away, chased by Hiroshi. Anna confronts Shun who describes that this event is the same as the game he created."}
5
+ {"id": "4", "contents": "\"Pavia Cathedral\"\non March 17, 1989. The cathedral was begun in 1488, under architect Cristoforo Rocchi, who was soon replaced by Giovanni Antonio Amadeo and Gian Giacomo Dolcebuono. The original project, with a nave and two aisles flanked by semicircular niches and a large central dome, was influenced by Bramante, some details of it later appearing in St. Peter's Basilica in Rome. Leonardo da Vinci is also known to have contributed to the project. In 1521, the altar area was completed by Gianpietrino Rizzi, a pupil of Da Vinci. By the 17th century, the presbytery had been completed but only in the"}
6
+ {"id": "5", "contents": "\"Pavia Cathedral\"\nfollowing century was the tambour built, while the dome itself and the facade had to wait for the 19th century. The dome was designed by Carlo Maciachini and completed in 1885, but partially collapsed the same year. In 1930, construction continued with the two arms of the transept, for which the original plan was followed, although using reinforced concrete (in order to save the remains of the medieval Santa Maria del Popolo). The arms are still missing part of the internal marble decoration. The church is on the Greek Cross plan: it therefore has the same length and width at"}
7
+ {"id": "6", "contents": "\"Iowa Highway 17\"\nWesley. Although Iowa 17 has only been designated since 1969, most of the route has been a part of the primary highway system since the system's inception in 1919. The route was designated Primary Road No. 60 and connected Des Moines and Goldfield via Webster City. By 1930, the only paved section of what was by then called Iowa 60 was the southernmost from Des Moines to south of Madrid. In two years, paving extended to US 30 east of Boone. 1932 saw many changes to Iowa 60. The route was extended to the north where it ended at US"}
8
+ {"id": "7", "contents": "\"E.T. the Extra-Terrestrial (video game)\"\nfinancial failure for Atari. By 2004, the cartridges were still very common and offered at very low prices. While reviews of the movie were highly positive, the game was negatively received by critics, with common complaints focused on the plot, gameplay, and visuals. \"\"New York\"\" magazine's Nicholas Pileggi described it as a loser when compared to other games Atari could have released like \"\"Donkey Kong\"\" and \"\"Frogger\"\". \"\"Video Games\"\" called the game \"\"really for kids (the littler ones)\"\". Kevin Bowen of GameSpy's Classic Gaming called the gameplay \"\"convoluted and inane\"\", also criticizing its story for departing from the serious tone"}
9
+ {"id": "8", "contents": "\"Ao Oni (film)\"\n(for Takeshi cannot see Shun), and why Anna said Mika will not call him, because he is already dead and cannot be seen by everyone, except Anna. He then flashbacks to the time Takuro bullied him in the riverbank. Takuro hit Shun in the head, killing him. It is then revealed that Takuro went to the empty house to hide Shun's body. Takuro then explained that Shun used Takuro's name in the game and got mad. Shun is terrified with the truth and finally disappears as Anna apologizes to him. Takuro plans on killing Anna since she knows too much,"}
10
+ {"id": "9", "contents": "\"4th Airborne Corps (Soviet Union)\"\n4th Airborne Corps (Soviet Union) The 4th Airborne Corps was an airborne corps of the Red Army in World War II. It fought in the Vyazma airborne operation, an unsuccessful landing during the Rzhev-Vyazma Offensive. The corps was formed in the spring of 1941 in the Western Special Military District from the personnel of the 214th Airborne Brigade. The corps was commanded by Aleksey Semenovich Zhadov. On 22 June 1941, the corps was stationed in the Western Front's second echelon in Pukhavichy in Minsk Region. On 26 June, the corps was ordered to conduct an air-assault and ground attack with"}
code/RL_model/verl/Search-R1/misc/example/multinode/train_grpo_multinode_32b.sh ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_name=nq_hotpotqa_train
2
+
3
+ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
4
+ export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
5
+
6
+ WAND_PROJECT="Search-R1"
7
+ RAY_DASHBOARD_ADDRESS="http://xx.xx.xx.xx:8265" # your head node address
8
+ N_NODES=4
9
+
10
+ export BASE_MODEL='Qwen/Qwen2.5-32B'
11
+ export EXPERIMENT_NAME=${train_data}-${test_data}-search-r1-grpo-qwen2.5-32b-em-multinode-${N_NODES}
12
+
13
+ # set -x
14
+ export VLLM_ATTENTION_BACKEND=XFORMERS # vllm + qwen2-7b with flash_attn has some issues
15
+
16
+ # max_prompt_length = (config['training']['max_start_length'] + config['training']['max_response_length'] * (config['training']['max_turns'] - 1) + config['training']['max_obs_length'] * config['training']['max_turns'])
17
+
18
+ ulimit -n 65535
19
+
20
+ ray job submit --address=$RAY_DASHBOARD_ADDRESS \
21
+ --runtime-env=verl/trainer/runtime_env.yaml \
22
+ --no-wait \
23
+ -- \
24
+ python3 -m verl.trainer.main_ppo \
25
+ data.train_files=$DATA_DIR/train.parquet \
26
+ data.val_files=$DATA_DIR/test.parquet \
27
+ data.train_data_num=null \
28
+ data.val_data_num=null \
29
+ data.train_batch_size=512 \
30
+ data.val_batch_size=256 \
31
+ data.max_prompt_length=4096 \
32
+ data.max_response_length=500 \
33
+ data.max_start_length=2048 \
34
+ data.max_obs_length=500 \
35
+ data.shuffle_train_dataloader=True \
36
+ algorithm.adv_estimator=grpo \
37
+ actor_rollout_ref.model.path=$BASE_MODEL \
38
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
39
+ actor_rollout_ref.model.use_remove_padding=True \
40
+ actor_rollout_ref.actor.optim.lr=2e-7 \
41
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
42
+ actor_rollout_ref.actor.use_kl_loss=True \
43
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
44
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
45
+ actor_rollout_ref.actor.fsdp_config.param_offload=false \
46
+ actor_rollout_ref.actor.fsdp_config.grad_offload=false \
47
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=false \
48
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
49
+ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
50
+ actor_rollout_ref.rollout.name=vllm \
51
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
52
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
53
+ actor_rollout_ref.ref.fsdp_config.param_offload=false \
54
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
55
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
56
+ algorithm.no_think_rl=false \
57
+ actor_rollout_ref.rollout.n_agent=5 \
58
+ actor_rollout_ref.rollout.temperature=1 \
59
+ actor_rollout_ref.actor.state_masking=True \
60
+ trainer.logger=['wandb'] \
61
+ +trainer.val_only=false \
62
+ +trainer.val_before_train=false \
63
+ trainer.default_hdfs_dir=null \
64
+ trainer.n_gpus_per_node=8 \
65
+ trainer.nnodes=$N_NODES \
66
+ trainer.save_freq=100 \
67
+ trainer.test_freq=100 \
68
+ trainer.project_name=$WAND_PROJECT \
69
+ trainer.experiment_name=$EXPERIMENT_NAME \
70
+ trainer.total_epochs=15 \
71
+ trainer.total_training_steps=1005 \
72
+ trainer.default_hdfs_dir=null \
73
+ trainer.default_local_dir=verl_checkpoints/$EXPERIMENT_NAME \
74
+ max_turns=4 \
75
+ retriever.url="http://127.0.0.1:8000/retrieve" \
76
+ retriever.topk=3 \
77
+ 2>&1 | tee $EXPERIMENT_NAME.log
code/RL_model/verl/Search-R1/misc/example/multinode/train_grpo_multinode_72b.sh ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_name=nq_hotpotqa_train
2
+
3
+ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
4
+ export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
5
+
6
+ WAND_PROJECT="Search-R1"
7
+ RAY_DASHBOARD_ADDRESS="http://xx.xx.xx.xx:8265" # your head node address
8
+ N_NODES=4
9
+
10
+ export BASE_MODEL='Qwen/Qwen2.5-72B'
11
+ export EXPERIMENT_NAME=${train_data}-${test_data}-search-r1-grpo-qwen2.5-72b-em-multinode-${N_NODES}
12
+
13
+ # set -x
14
+ export VLLM_ATTENTION_BACKEND=XFORMERS # vllm + qwen2-7b with flash_attn has some issues
15
+
16
+ ulimit -n 65535
17
+
18
+ ray job submit --address=$RAY_DASHBOARD_ADDRESS \
19
+ --runtime-env=verl/trainer/runtime_env.yaml \
20
+ --no-wait \
21
+ -- \
22
+ python3 -m verl.trainer.main_ppo \
23
+ data.train_files=$DATA_DIR/train.parquet \
24
+ data.val_files=$DATA_DIR/test.parquet \
25
+ data.train_data_num=null \
26
+ data.val_data_num=null \
27
+ data.train_batch_size=512 \
28
+ data.val_batch_size=256 \
29
+ data.max_prompt_length=4096 \
30
+ data.max_response_length=500 \
31
+ data.max_start_length=2048 \
32
+ data.max_obs_length=500 \
33
+ data.shuffle_train_dataloader=True \
34
+ algorithm.adv_estimator=grpo \
35
+ actor_rollout_ref.model.path=$BASE_MODEL \
36
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
37
+ actor_rollout_ref.model.use_remove_padding=True \
38
+ actor_rollout_ref.actor.optim.lr=1e-7 \
39
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
40
+ actor_rollout_ref.actor.use_kl_loss=True \
41
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
42
+ actor_rollout_ref.actor.ppo_micro_batch_size=32 \
43
+ actor_rollout_ref.actor.fsdp_config.param_offload=True \
44
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
45
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
46
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
47
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
48
+ actor_rollout_ref.rollout.name=vllm \
49
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
50
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
51
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
52
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
53
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
54
+ algorithm.no_think_rl=false \
55
+ actor_rollout_ref.rollout.n_agent=5 \
56
+ actor_rollout_ref.rollout.temperature=1 \
57
+ actor_rollout_ref.actor.state_masking=True \
58
+ trainer.logger=['wandb'] \
59
+ +trainer.val_only=false \
60
+ +trainer.val_before_train=false \
61
+ trainer.default_hdfs_dir=null \
62
+ trainer.n_gpus_per_node=8 \
63
+ trainer.nnodes=$N_NODES \
64
+ trainer.save_freq=100 \
65
+ trainer.test_freq=100 \
66
+ trainer.project_name=$WAND_PROJECT \
67
+ trainer.experiment_name=$EXPERIMENT_NAME \
68
+ trainer.total_epochs=15 \
69
+ trainer.total_training_steps=1005 \
70
+ trainer.default_hdfs_dir=null \
71
+ trainer.default_local_dir=verl_checkpoints/$EXPERIMENT_NAME \
72
+ max_turns=4 \
73
+ retriever.url="http://127.0.0.1:8000/retrieve" \
74
+ retriever.topk=3 \
75
+ 2>&1 | tee $EXPERIMENT_NAME.log
code/RL_model/verl/Search-R1/misc/example/multinode/train_ppo_multinode_32b.sh ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_name=nq_hotpotqa_train
2
+
3
+ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
4
+ export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
5
+
6
+ WAND_PROJECT="Search-R1"
7
+ RAY_DASHBOARD_ADDRESS="http://xx.xx.xx.xx:8265" # your head node address
8
+ N_NODES=4
9
+
10
+ export BASE_MODEL='Qwen/Qwen2.5-32B'
11
+ export EXPERIMENT_NAME=${train_data}-${test_data}-search-r1-ppo-qwen2.5-32b-em-multinode-${N_NODES}
12
+
13
+ # set -x
14
+ export VLLM_ATTENTION_BACKEND=XFORMERS
15
+
16
+ ulimit -n 65535
17
+
18
+ ray job submit --address=$RAY_DASHBOARD_ADDRESS \
19
+ --runtime-env=verl/trainer/runtime_env.yaml \
20
+ --no-wait \
21
+ -- \
22
+ python3 -m verl.trainer.main_ppo \
23
+ data.train_files=$DATA_DIR/train.parquet \
24
+ data.val_files=$DATA_DIR/test.parquet \
25
+ data.train_data_num=null \
26
+ data.val_data_num=null \
27
+ data.train_batch_size=512 \
28
+ data.val_batch_size=256 \
29
+ data.max_prompt_length=4096 \
30
+ data.max_response_length=500 \
31
+ data.max_start_length=2048 \
32
+ data.max_obs_length=500 \
33
+ data.shuffle_train_dataloader=True \
34
+ algorithm.adv_estimator=gae \
35
+ actor_rollout_ref.model.path=$BASE_MODEL \
36
+ actor_rollout_ref.actor.optim.lr=2e-7 \
37
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
38
+ actor_rollout_ref.model.use_remove_padding=True \
39
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
40
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
41
+ actor_rollout_ref.actor.ppo_micro_batch_size=32 \
42
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
43
+ actor_rollout_ref.actor.fsdp_config.grad_offload=False \
44
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
45
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
46
+ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
47
+ actor_rollout_ref.rollout.name=vllm \
48
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
49
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
50
+ actor_rollout_ref.ref.fsdp_config.param_offload=False \
51
+ actor_rollout_ref.rollout.n_agent=1 \
52
+ actor_rollout_ref.rollout.temperature=1 \
53
+ actor_rollout_ref.rollout.top_p=1.0 \
54
+ actor_rollout_ref.actor.state_masking=true \
55
+ critic.optim.lr=1e-5 \
56
+ critic.model.use_remove_padding=True \
57
+ critic.optim.lr_warmup_steps_ratio=0.015 \
58
+ critic.model.path=$BASE_MODEL \
59
+ critic.model.enable_gradient_checkpointing=true \
60
+ critic.ppo_micro_batch_size=32 \
61
+ critic.model.fsdp_config.param_offload=False \
62
+ critic.model.fsdp_config.grad_offload=False \
63
+ critic.model.fsdp_config.optimizer_offload=True \
64
+ algorithm.kl_ctrl.kl_coef=0.001 \
65
+ algorithm.no_think_rl=false \
66
+ trainer.critic_warmup=0 \
67
+ trainer.logger=['wandb'] \
68
+ +trainer.val_only=false \
69
+ +trainer.val_before_train=true \
70
+ trainer.default_hdfs_dir=null \
71
+ trainer.n_gpus_per_node=8 \
72
+ trainer.nnodes=$N_NODES \
73
+ trainer.save_freq=100 \
74
+ trainer.test_freq=100 \
75
+ trainer.project_name=$WAND_PROJECT \
76
+ trainer.experiment_name=$EXPERIMENT_NAME \
77
+ trainer.total_epochs=15 \
78
+ trainer.total_training_steps=1005 \
79
+ trainer.default_hdfs_dir=null \
80
+ trainer.default_local_dir=verl_checkpoints/$EXPERIMENT_NAME \
81
+ max_turns=4 \
82
+ retriever.url="http://127.0.0.1:8000/retrieve" \
83
+ retriever.topk=3 \
84
+ 2>&1 | tee $EXPERIMENT_NAME.log
code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_ann.sh ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ file_path=/the/path/you/save/corpus
3
+ index_file=$file_path/e5_HNSW64.index
4
+ corpus_file=$file_path/wiki-18.jsonl
5
+ retriever_name=e5
6
+ retriever_path=intfloat/e5-base-v2
7
+
8
+ python search_r1/search/retrieval_server.py --index_path $index_file \
9
+ --corpus_path $corpus_file \
10
+ --topk 3 \
11
+ --retriever_name $retriever_name \
12
+ --retriever_model $retriever_path
code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_bm25.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ file_path=/the/path/you/save/corpus
3
+ index_file=$file_path/bm25
4
+ corpus_file=$file_path/wiki-18.jsonl
5
+ retriever_name=bm25
6
+
7
+ python search_r1/search/retrieval_server.py --index_path $index_file \
8
+ --corpus_path $corpus_file \
9
+ --topk 3 \
10
+ --retriever_name $retriever_name
code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_google.sh ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+
2
+ api_key="" # put your google custom API key here (https://developers.google.com/custom-search/v1/overview)
3
+ cse_id="" # put your google cse API key here (https://developers.google.com/custom-search/v1/overview)
4
+
5
+ python search_r1/search/internal_google_server.py --api_key $api_key \
6
+ --topk 5 \
7
+ --cse_id $cse_id \
8
+ --snippet_only
code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_hierarchical.sh ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ file_path=/the/path/you/save/corpus
3
+ index_file=$file_path/e5_Flat.index
4
+ corpus_file=$file_path/wiki-18.jsonl
5
+ retriever_name=e5
6
+ retriever_path=intfloat/e5-base-v2
7
+ reranker_path=cross-encoder/ms-marco-MiniLM-L12-v2
8
+
9
+ python search_r1/search/retrieval_rerank_server.py --index_path $index_file \
10
+ --corpus_path $corpus_file \
11
+ --retrieval_topk 10 \
12
+ --retriever_name $retriever_name \
13
+ --retriever_model $retriever_path \
14
+ --faiss_gpu \
15
+ --reranking_topk 3 \
16
+ --reranker_model $reranker_path \
17
+ --reranker_batch_size 32
code/RL_model/verl/Search-R1/misc/example/retriever/retrieval_launch_serpapi.sh ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+
2
+ search_url=https://serpapi.com/search
3
+ serp_api_key="" # put your serp api key here (https://serpapi.com/)
4
+
5
+ python search_r1/search/online_search_server.py --search_url $search_url \
6
+ --topk 3 \
7
+ --serp_api_key $serp_api_key
code/RL_model/verl/Search-R1/misc/public/status.png ADDED
code/RL_model/verl/Search-R1/misc/public/worker.png ADDED
code/RL_model/verl/Search-R1/misc/scripts/data_process/nq.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Preprocess the nq dataset to parquet format
16
+ """
17
+
18
+ import re
19
+ import os
20
+ import datasets
21
+
22
+ from verl.utils.hdfs_io import copy, makedirs
23
+ import argparse
24
+
25
+
26
+ def make_prefix(dp, template_type):
27
+ question = dp['question']
28
+
29
+ # NOTE: also need to change reward_score/countdown.py
30
+ if template_type == 'base':
31
+ """This works for any base model"""
32
+ prefix = f"""Answer the given question. \
33
+ You should first have a reasoning process in mind and then provides the answer. \
34
+ Show your reasoning in <think> </think> tags and return the final answer in <answer> </answer> tags, for example <answer> Beijing </answer>. \
35
+ Question: {question}\n"""
36
+ else:
37
+ raise NotImplementedError
38
+ return prefix
39
+
40
+
41
+ if __name__ == '__main__':
42
+ parser = argparse.ArgumentParser()
43
+ parser.add_argument('--local_dir', default='./data/nq')
44
+ parser.add_argument('--hdfs_dir', default=None)
45
+ parser.add_argument('--template_type', type=str, default='base')
46
+
47
+ args = parser.parse_args()
48
+
49
+ data_source = 'nq'
50
+
51
+ dataset = datasets.load_dataset('RUC-NLPIR/FlashRAG_datasets', 'nq')
52
+
53
+ train_dataset = dataset['train']
54
+ test_dataset = dataset['test']
55
+
56
+ # add a row to each data item that represents a unique id
57
+ def make_map_fn(split):
58
+
59
+ def process_fn(example, idx):
60
+ example['question'] = example['question'].strip()
61
+ if example['question'][-1] != '?':
62
+ example['question'] += '?'
63
+ question = make_prefix(example, template_type=args.template_type)
64
+ solution = {
65
+ "target": example['golden_answers'],
66
+ }
67
+
68
+ data = {
69
+ "data_source": data_source,
70
+ "prompt": [{
71
+ "role": "user",
72
+ "content": question,
73
+ }],
74
+ "ability": "fact-reasoning",
75
+ "reward_model": {
76
+ "style": "rule",
77
+ "ground_truth": solution
78
+ },
79
+ "extra_info": {
80
+ 'split': split,
81
+ 'index': idx,
82
+ }
83
+ }
84
+ return data
85
+
86
+ return process_fn
87
+
88
+ train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
89
+ test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)
90
+
91
+ local_dir = args.local_dir
92
+ hdfs_dir = args.hdfs_dir
93
+
94
+ train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
95
+ test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))
96
+
97
+ if hdfs_dir is not None:
98
+ makedirs(hdfs_dir)
99
+
100
+ copy(src=local_dir, dst=hdfs_dir)
code/RL_model/verl/Search-R1/misc/scripts/data_process/nq_rag.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Preprocess the nq dataset to parquet format
16
+ """
17
+
18
+ import re
19
+ import os
20
+ import json
21
+ import datasets
22
+
23
+ from verl.utils.hdfs_io import copy, makedirs
24
+ import argparse
25
+
26
+
27
+ def make_prefix(dp, template_type):
28
+ question = dp['question']
29
+ context = dp['context']
30
+
31
+ # NOTE: also need to change reward_score/countdown.py
32
+ if template_type == 'base':
33
+ """This works for any base model"""
34
+ prefix = f"""Answer the given question with some potentially useful context. \
35
+ You should analyze the question carefully, evaluate the given context (which may or may not be useful), and then generate an accurate and well-reasoned response. \
36
+ You should first have a reasoning process in mind and then provides the answer. \
37
+ Show your reasoning in <think> </think> tags and return the final answer in <answer> </answer> tags, for example <answer> Beijing </answer>. \
38
+ Question: {question} Context: {context} \n"""
39
+ else:
40
+ raise NotImplementedError
41
+ return prefix
42
+
43
+
44
+ def format_reference(retrieval_result):
45
+ format_reference = ''
46
+ for idx, doc_item in enumerate(retrieval_result):
47
+ content = doc_item['contents']
48
+ title = content.split("\n")[0]
49
+ text = "\n".join(content.split("\n")[1:])
50
+ format_reference += f"Doc {idx+1}(Title: {title}) {text}\n"
51
+
52
+ return format_reference
53
+
54
+
55
+ if __name__ == '__main__':
56
+ parser = argparse.ArgumentParser()
57
+ parser.add_argument('--local_dir', default='./data/nq_rag')
58
+ parser.add_argument('--hdfs_dir', default=None)
59
+ parser.add_argument('--template_type', type=str, default='base')
60
+ parser.add_argument('--topk', type=int, default=3)
61
+ parser.add_argument('--corpus_path', type=str, default='/home/peterjin/mnt/data/retrieval-corpus/wiki-18.jsonl')
62
+ parser.add_argument('--train_retrieval_cache', type=str, default='/home/peterjin/rag_retrieval_cache/nq/e5_train_retrieval_cache_2048.json')
63
+ parser.add_argument('--test_retrieval_cache', type=str, default='/home/peterjin/rag_retrieval_cache/nq/e5_test_retrieval_cache_10000.json')
64
+
65
+ args = parser.parse_args()
66
+
67
+ data_source = 'nq'
68
+
69
+ dataset = datasets.load_dataset('RUC-NLPIR/FlashRAG_datasets', 'nq')
70
+
71
+ train_dataset = dataset['train']
72
+ test_dataset = dataset['test']
73
+
74
+ # read retrieval cache
75
+ print('reading retrieval cache...')
76
+ retrieval_cache = json.load(open(args.train_retrieval_cache))
77
+ # test_retrieval_cache = json.load(open(args.test_retrieval_cache))
78
+ retrieval_cache.update(json.load(open(args.test_retrieval_cache)))
79
+
80
+ # read corpus
81
+ print('reading corpus...')
82
+ corpus = {}
83
+ with open(args.corpus_path) as f:
84
+ readin = f.readlines()
85
+ for line in readin:
86
+ tmp = json.loads(line)
87
+ corpus[tmp['id']] = tmp
88
+
89
+ # add a column for the retrieval context
90
+ def add_context(example):
91
+ example['context'] = format_reference([corpus[docs["id"]] for docs in retrieval_cache[example['question']][:args.topk]])
92
+ return example
93
+
94
+ train_dataset = train_dataset.map(function=add_context)
95
+ test_dataset = test_dataset.map(function=add_context)
96
+
97
+ # add a row to each data item that represents a unique id
98
+ def make_map_fn(split):
99
+
100
+ def process_fn(example, idx):
101
+ example['question'] = example['question'].strip()
102
+ if example['question'][-1] != '?':
103
+ example['question'] += '?'
104
+ question = make_prefix(example, template_type=args.template_type)
105
+ solution = {
106
+ "target": example['golden_answers'],
107
+ }
108
+
109
+ data = {
110
+ "data_source": data_source,
111
+ "prompt": [{
112
+ "role": "user",
113
+ "content": question,
114
+ }],
115
+ "ability": "fact-reasoning",
116
+ "reward_model": {
117
+ "style": "rule",
118
+ "ground_truth": solution
119
+ },
120
+ "extra_info": {
121
+ 'split': split,
122
+ 'index': idx,
123
+ }
124
+ }
125
+ return data
126
+
127
+ return process_fn
128
+
129
+ train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
130
+ test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)
131
+
132
+ local_dir = args.local_dir
133
+ hdfs_dir = args.hdfs_dir
134
+
135
+ train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
136
+ test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))
137
+
138
+ if hdfs_dir is not None:
139
+ makedirs(hdfs_dir)
140
+
141
+ copy(src=local_dir, dst=hdfs_dir)
code/RL_model/verl/Search-R1/misc/scripts/data_process/nq_search.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Preprocess the nq dataset to parquet format
16
+ """
17
+
18
+ import re
19
+ import os
20
+ import datasets
21
+
22
+ from verl.utils.hdfs_io import copy, makedirs
23
+ import argparse
24
+
25
+
26
+ def make_prefix(dp, template_type):
27
+ question = dp['question']
28
+
29
+ # NOTE: also need to change reward_score/countdown.py
30
+ if template_type == 'base':
31
+ """This works for any base model"""
32
+ prefix = f"""Answer the given question. \
33
+ You must conduct reasoning inside <think> and </think> first every time you get new information. \
34
+ After reasoning, if you find you lack some knowledge, you can call a search engine by <search> query </search> and it will return the top searched results between <information> and </information>. \
35
+ You can search as many times as your want. \
36
+ If you find no further external knowledge needed, you can directly provide the answer inside <answer> and </answer>, without detailed illustrations. For example, <answer> Beijing </answer>. Question: {question}\n"""
37
+ else:
38
+ raise NotImplementedError
39
+ return prefix
40
+
41
+
42
+ if __name__ == '__main__':
43
+ parser = argparse.ArgumentParser()
44
+ parser.add_argument('--local_dir', default='./data/nq_search')
45
+ parser.add_argument('--hdfs_dir', default=None)
46
+ parser.add_argument('--template_type', type=str, default='base')
47
+
48
+ args = parser.parse_args()
49
+
50
+ data_source = 'nq'
51
+
52
+ dataset = datasets.load_dataset('RUC-NLPIR/FlashRAG_datasets', 'nq')
53
+
54
+ train_dataset = dataset['train']
55
+ test_dataset = dataset['test']
56
+
57
+ # add a row to each data item that represents a unique id
58
+ def make_map_fn(split):
59
+
60
+ def process_fn(example, idx):
61
+ example['question'] = example['question'].strip()
62
+ if example['question'][-1] != '?':
63
+ example['question'] += '?'
64
+ question = make_prefix(example, template_type=args.template_type)
65
+ solution = {
66
+ "target": example['golden_answers'],
67
+ }
68
+
69
+ data = {
70
+ "data_source": data_source,
71
+ "prompt": [{
72
+ "role": "user",
73
+ "content": question,
74
+ }],
75
+ "ability": "fact-reasoning",
76
+ "reward_model": {
77
+ "style": "rule",
78
+ "ground_truth": solution
79
+ },
80
+ "extra_info": {
81
+ 'split': split,
82
+ 'index': idx,
83
+ }
84
+ }
85
+ return data
86
+
87
+ return process_fn
88
+
89
+ train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
90
+ test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)
91
+
92
+ local_dir = args.local_dir
93
+ hdfs_dir = args.hdfs_dir
94
+
95
+ train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
96
+ test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))
97
+
98
+ if hdfs_dir is not None:
99
+ makedirs(hdfs_dir)
100
+
101
+ copy(src=local_dir, dst=hdfs_dir)
code/RL_model/verl/Search-R1/misc/scripts/data_process/qa_search_test_merge.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Preprocess the QA dataset to parquet format
16
+ """
17
+
18
+ import re
19
+ import os
20
+ import datasets
21
+
22
+ from verl.utils.hdfs_io import copy, makedirs
23
+ import argparse
24
+
25
+
26
+ def make_prefix(dp, template_type):
27
+ question = dp['question']
28
+
29
+ # NOTE: also need to change reward_score/countdown.py
30
+ if template_type == 'base':
31
+ """This works for any base model"""
32
+ prefix = f"""Answer the given question. \
33
+ You must conduct reasoning inside <think> and </think> first every time you get new information. \
34
+ After reasoning, if you find you lack some knowledge, you can call a search engine by <search> query </search> and it will return the top searched results between <information> and </information>. \
35
+ You can search as many times as your want. \
36
+ If you find no further external knowledge needed, you can directly provide the answer inside <answer> and </answer>, without detailed illustrations. For example, <answer> Beijing </answer>. Question: {question}\n"""
37
+ else:
38
+ raise NotImplementedError
39
+ return prefix
40
+
41
+
42
+ if __name__ == '__main__':
43
+ parser = argparse.ArgumentParser()
44
+ parser.add_argument('--local_dir', default='./data/nq_search')
45
+ parser.add_argument('--hdfs_dir', default=None)
46
+ parser.add_argument('--template_type', type=str, default='base')
47
+ parser.add_argument('--data_sources', default='nq')
48
+
49
+ args = parser.parse_args()
50
+
51
+ data_sources = args.data_sources.split(',')
52
+ all_dataset = []
53
+
54
+ for data_source in data_sources:
55
+
56
+ if data_source != 'strategyqa':
57
+ dataset = datasets.load_dataset('RUC-NLPIR/FlashRAG_datasets', data_source)
58
+ else:
59
+ dataset = datasets.load_dataset('json', data_files="/home/peterjin/mnt/data/strategyqa/test_correct.jsonl")
60
+
61
+ if 'test' in dataset:
62
+ print(f'Using the {data_source} test dataset...')
63
+ test_dataset = dataset['test']
64
+ elif 'dev' in dataset:
65
+ print(f'Using the {data_source} dev dataset...')
66
+ test_dataset = dataset['dev']
67
+ else:
68
+ print(f'Using the {data_source} train dataset...')
69
+ test_dataset = dataset['train']
70
+
71
+ # add a row to each data item that represents a unique id
72
+ def make_map_fn(split):
73
+
74
+ def process_fn(example, idx):
75
+ example['question'] = example['question'].strip()
76
+ if example['question'][-1] != '?':
77
+ example['question'] += '?'
78
+ question = make_prefix(example, template_type=args.template_type)
79
+ solution = {
80
+ "target": example['golden_answers'],
81
+ }
82
+
83
+ data = {
84
+ "data_source": data_source,
85
+ "prompt": [{
86
+ "role": "user",
87
+ "content": question,
88
+ }],
89
+ "ability": "fact-reasoning",
90
+ "reward_model": {
91
+ "style": "rule",
92
+ "ground_truth": solution
93
+ },
94
+ "extra_info": {
95
+ 'split': split,
96
+ 'index': idx,
97
+ }
98
+ }
99
+ return data
100
+
101
+ return process_fn
102
+
103
+ test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)
104
+ all_dataset.append(test_dataset)
105
+
106
+ local_dir = args.local_dir
107
+ hdfs_dir = args.hdfs_dir
108
+
109
+ all_test_dataset = datasets.concatenate_datasets(all_dataset)
110
+ all_test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))
111
+
112
+ if hdfs_dir is not None:
113
+ makedirs(hdfs_dir)
114
+
115
+ copy(src=local_dir, dst=hdfs_dir)
code/RL_model/verl/Search-R1/misc/scripts/data_process/qa_search_train_merge.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Preprocess the QA dataset to parquet format
16
+ """
17
+
18
+ import re
19
+ import os
20
+ import datasets
21
+
22
+ from verl.utils.hdfs_io import copy, makedirs
23
+ import argparse
24
+
25
+
26
+ def make_prefix(dp, template_type):
27
+ question = dp['question']
28
+
29
+ # NOTE: also need to change reward_score/countdown.py
30
+ if template_type == 'base':
31
+ """This works for any base model"""
32
+ prefix = f"""Answer the given question. \
33
+ You must conduct reasoning inside <think> and </think> first every time you get new information. \
34
+ After reasoning, if you find you lack some knowledge, you can call a search engine by <search> query </search> and it will return the top searched results between <information> and </information>. \
35
+ You can search as many times as your want. \
36
+ If you find no further external knowledge needed, you can directly provide the answer inside <answer> and </answer>, without detailed illustrations. For example, <answer> Beijing </answer>. Question: {question}\n"""
37
+ else:
38
+ raise NotImplementedError
39
+ return prefix
40
+
41
+
42
+ if __name__ == '__main__':
43
+ parser = argparse.ArgumentParser()
44
+ parser.add_argument('--local_dir', default='./data/nq_search')
45
+ parser.add_argument('--hdfs_dir', default=None)
46
+ parser.add_argument('--template_type', type=str, default='base')
47
+ parser.add_argument('--data_sources', default='nq')
48
+
49
+ args = parser.parse_args()
50
+
51
+ # data_source = 'nq'
52
+ data_sources = args.data_sources.split(',')
53
+ all_dataset = []
54
+
55
+ for data_source in data_sources:
56
+
57
+ dataset = datasets.load_dataset('RUC-NLPIR/FlashRAG_datasets', data_source)
58
+
59
+ train_dataset = dataset['train']
60
+
61
+ # add a row to each data item that represents a unique id
62
+ def make_map_fn(split):
63
+
64
+ def process_fn(example, idx):
65
+ example['question'] = example['question'].strip()
66
+ if example['question'][-1] != '?':
67
+ example['question'] += '?'
68
+ question = make_prefix(example, template_type=args.template_type)
69
+ solution = {
70
+ "target": example['golden_answers'],
71
+ }
72
+
73
+ data = {
74
+ "data_source": data_source,
75
+ "prompt": [{
76
+ "role": "user",
77
+ "content": question,
78
+ }],
79
+ "ability": "fact-reasoning",
80
+ "reward_model": {
81
+ "style": "rule",
82
+ "ground_truth": solution
83
+ },
84
+ "extra_info": {
85
+ 'split': split,
86
+ 'index': idx,
87
+ }
88
+ }
89
+ return data
90
+
91
+ return process_fn
92
+
93
+ train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
94
+ all_dataset.append(train_dataset)
95
+
96
+ local_dir = args.local_dir
97
+ hdfs_dir = args.hdfs_dir
98
+
99
+ all_train_dataset = datasets.concatenate_datasets(all_dataset)
100
+ all_train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
101
+
102
+ if hdfs_dir is not None:
103
+ makedirs(hdfs_dir)
104
+
105
+ copy(src=local_dir, dst=hdfs_dir)
code/RL_model/verl/Search-R1/misc/scripts/download.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from huggingface_hub import hf_hub_download
3
+
4
+ parser = argparse.ArgumentParser(description="Download files from a Hugging Face dataset repository.")
5
+ parser.add_argument("--repo_id", type=str, default="PeterJinGo/wiki-18-e5-index", help="Hugging Face repository ID")
6
+ parser.add_argument("--save_path", type=str, required=True, help="Local directory to save files")
7
+
8
+ args = parser.parse_args()
9
+
10
+ repo_id = "PeterJinGo/wiki-18-e5-index"
11
+ for file in ["part_aa", "part_ab"]:
12
+ hf_hub_download(
13
+ repo_id=repo_id,
14
+ filename=file, # e.g., "e5_Flat.index"
15
+ repo_type="dataset",
16
+ local_dir=args.save_path,
17
+ )
18
+
19
+ repo_id = "PeterJinGo/wiki-18-corpus"
20
+ hf_hub_download(
21
+ repo_id=repo_id,
22
+ filename="wiki-18.jsonl.gz",
23
+ repo_type="dataset",
24
+ local_dir=args.save_path,
25
+ )
code/RL_model/verl/Search-R1/misc/scripts/download.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+
2
+ save_path=/home/peterjin/debug_cache
3
+
4
+ python download.py --save_path $save_path
5
+
6
+ cat $save_path/part_* > $save_path/e5_Flat.index
code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/README.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ## Reproduce the paper results
3
+
4
+ ### Download the dataset
5
+
6
+ ```bash
7
+ huggingface-cli download --repo-type dataset PeterJinGo/nq_hotpotqa_train --local-dir $WORK_DIR/data/nq_hotpotqa_train
8
+ ```
9
+
10
+ ### Launch the local search engine
11
+
12
+ (1) Download the indexing and corpus.
13
+ ```bash
14
+ save_path=/the/path/to/save
15
+ python scripts/download.py --save_path $save_path
16
+ cat $save_path/part_* > $save_path/e5_Flat.index
17
+ gzip -d $save_path/wiki-18.jsonl.gz
18
+ ```
19
+
20
+ (2) Launch a local retrieval server.
21
+ ```bash
22
+ conda activate retriever
23
+ bash retrieval_launch.sh
24
+ ```
25
+
26
+ ### Run PPO training
27
+ ```bash
28
+ bash train_ppo.sh
29
+ ```
30
+
31
+
32
+ ### Run GRPO training
33
+ ```bash
34
+ bash train_grpo.sh
35
+ ```
36
+
37
+ ### Run evaluation
38
+ ```bash
39
+ bash evaluate.sh
40
+ ```
41
+
42
+ You can change ```$BASE_MODEL``` to the path of the model you would like to evaluate.
code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/data_process.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ WORK_DIR=your/work/dir
2
+ LOCAL_DIR=$WORK_DIR/data/nq_hotpotqa_train
3
+
4
+ ## process multiple dataset search format train file
5
+ DATA=nq,hotpotqa
6
+ python $WORK_DIR/scripts/data_process/qa_search_train_merge.py --local_dir $LOCAL_DIR --data_sources $DATA
7
+
8
+ ## process multiple dataset search format test file
9
+ DATA=nq,triviaqa,popqa,hotpotqa,2wikimultihopqa,musique,bamboogle
10
+ python $WORK_DIR/scripts/data_process/qa_search_test_merge.py --local_dir $LOCAL_DIR --data_sources $DATA
code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/evaluate.sh ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_name=nq_hotpotqa_train
2
+
3
+ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
4
+ export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
5
+
6
+ export BASE_MODEL=""
7
+
8
+ # set -x
9
+ export VLLM_ATTENTION_BACKEND=XFORMERS # vllm + qwen2-7b with flash_attn has some issues
10
+
11
+ # max_prompt_length = (config['training']['max_start_length'] + config['training']['max_response_length'] * (config['training']['max_turns'] - 1) + config['training']['max_obs_length'] * config['training']['max_turns'])
12
+
13
+ PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
14
+ data.train_files=$DATA_DIR/train.parquet \
15
+ data.val_files=$DATA_DIR/test.parquet \
16
+ data.train_data_num=null \
17
+ data.val_data_num=null \
18
+ data.train_batch_size=512 \
19
+ data.val_batch_size=256 \
20
+ data.max_prompt_length=4096 \
21
+ data.max_response_length=500 \
22
+ data.max_start_length=2048 \
23
+ data.max_obs_length=500 \
24
+ data.shuffle_train_dataloader=True \
25
+ algorithm.adv_estimator=gae \
26
+ actor_rollout_ref.model.path=$BASE_MODEL \
27
+ actor_rollout_ref.actor.optim.lr=1e-6 \
28
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
29
+ actor_rollout_ref.model.use_remove_padding=True \
30
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.95 \
31
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
32
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
33
+ actor_rollout_ref.actor.fsdp_config.param_offload=true \
34
+ actor_rollout_ref.actor.fsdp_config.grad_offload=true \
35
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=true \
36
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
37
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
38
+ actor_rollout_ref.rollout.name=vllm \
39
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
40
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
41
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
42
+ actor_rollout_ref.rollout.n_agent=1 \
43
+ actor_rollout_ref.rollout.temperature=1 \
44
+ actor_rollout_ref.actor.state_masking=true \
45
+ critic.optim.lr=1e-5 \
46
+ critic.model.use_remove_padding=True \
47
+ critic.optim.lr_warmup_steps_ratio=0.05 \
48
+ critic.model.path=$BASE_MODEL \
49
+ critic.model.enable_gradient_checkpointing=true \
50
+ critic.ppo_micro_batch_size=8 \
51
+ critic.model.fsdp_config.param_offload=true \
52
+ critic.model.fsdp_config.grad_offload=true \
53
+ critic.model.fsdp_config.optimizer_offload=true \
54
+ algorithm.kl_ctrl.kl_coef=0.001 \
55
+ algorithm.no_think_rl=false \
56
+ trainer.critic_warmup=0 \
57
+ trainer.logger=[] \
58
+ +trainer.val_only=true \
59
+ +trainer.val_before_train=true \
60
+ trainer.default_hdfs_dir=null \
61
+ trainer.n_gpus_per_node=8 \
62
+ trainer.nnodes=1 \
63
+ max_turns=4 \
64
+ retriever.url="http://127.0.0.1:8000/retrieve" \
65
+ retriever.topk=3
code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.1/train_grpo.sh ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_name=nq_hotpotqa_train
2
+
3
+ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
4
+ export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
5
+
6
+ WAND_PROJECT="Search-R1"
7
+
8
+ export BASE_MODEL='meta-llama/Llama-3.2-3B'
9
+ export EXPERIMENT_NAME=${data_name}-search-r1-grpo-llama3.2-3b-em
10
+ # export BASE_MODEL='meta-llama/Llama-3.2-3B-Instruct'
11
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-llama3.2-3b-it-em
12
+ # export BASE_MODEL='meta-llama/Llama-3.1-8B'
13
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-llama3.1-8b-em
14
+ # export BASE_MODEL='meta-llama/Llama-3.1-8B-Instruct'
15
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-llama3.1-8b-it-em
16
+
17
+ # export BASE_MODEL='Qwen/Qwen2.5-3B'
18
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-3b-em
19
+ # export BASE_MODEL='Qwen/Qwen2.5-3B-Instruct'
20
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-3b-it-em
21
+ # export BASE_MODEL='Qwen/Qwen2.5-7B'
22
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-7b-em
23
+ # export BASE_MODEL='Qwen/Qwen2.5-7B-Instruct'
24
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-7b-it-em
25
+
26
+ # set -x
27
+ export VLLM_ATTENTION_BACKEND=XFORMERS # vllm + qwen2-7b with flash_attn has some issues
28
+
29
+ # max_prompt_length = (config['training']['max_start_length'] + config['training']['max_response_length'] * (config['training']['max_turns'] - 1) + config['training']['max_obs_length'] * config['training']['max_turns'])
30
+
31
+ PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
32
+ data.train_files=$DATA_DIR/train.parquet \
33
+ data.val_files=$DATA_DIR/test.parquet \
34
+ data.train_data_num=null \
35
+ data.val_data_num=null \
36
+ data.train_batch_size=512 \
37
+ data.val_batch_size=256 \
38
+ data.max_prompt_length=4096 \
39
+ data.max_response_length=500 \
40
+ data.max_start_length=2048 \
41
+ data.max_obs_length=500 \
42
+ data.shuffle_train_dataloader=True \
43
+ algorithm.adv_estimator=grpo \
44
+ actor_rollout_ref.model.path=$BASE_MODEL \
45
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
46
+ actor_rollout_ref.model.use_remove_padding=True \
47
+ actor_rollout_ref.actor.optim.lr=1e-6 \
48
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.95 \
49
+ actor_rollout_ref.actor.use_kl_loss=true \
50
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
51
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
52
+ actor_rollout_ref.actor.fsdp_config.param_offload=true \
53
+ actor_rollout_ref.actor.fsdp_config.grad_offload=true \
54
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=true \
55
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
56
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
57
+ actor_rollout_ref.rollout.name=vllm \
58
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
59
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
60
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
61
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
62
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
63
+ algorithm.no_think_rl=false \
64
+ actor_rollout_ref.rollout.n_agent=5 \
65
+ actor_rollout_ref.rollout.temperature=1 \
66
+ actor_rollout_ref.actor.state_masking=true \
67
+ trainer.logger=['wandb'] \
68
+ +trainer.val_only=false \
69
+ +trainer.val_before_train=true \
70
+ trainer.default_hdfs_dir=null \
71
+ trainer.n_gpus_per_node=8 \
72
+ trainer.nnodes=1 \
73
+ trainer.save_freq=100 \
74
+ trainer.test_freq=50 \
75
+ trainer.project_name=$WAND_PROJECT \
76
+ trainer.experiment_name=$EXPERIMENT_NAME \
77
+ trainer.total_epochs=15 \
78
+ trainer.total_training_steps=305 \
79
+ trainer.default_hdfs_dir=null \
80
+ trainer.default_local_dir=verl_checkpoints/$EXPERIMENT_NAME \
81
+ max_turns=4 \
82
+ retriever.url="http://127.0.0.1:8000/retrieve" \
83
+ retriever.topk=3 \
84
+ 2>&1 | tee $EXPERIMENT_NAME.log
code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.1/train_ppo.sh ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_name=nq_hotpotqa_train
2
+
3
+ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
4
+ export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
5
+
6
+ WAND_PROJECT="Search-R1"
7
+
8
+ export BASE_MODEL='meta-llama/Llama-3.2-3B'
9
+ export EXPERIMENT_NAME=${data_name}-search-r1-ppo-llama3.2-3b-em
10
+ # export BASE_MODEL='meta-llama/Llama-3.2-3B-Instruct'
11
+ # export EXPERIMENT_NAME=${data_name}-search-r1-ppo-llama3.2-3b-it-em
12
+ # export BASE_MODEL='meta-llama/Llama-3.1-8B'
13
+ # export EXPERIMENT_NAME=${data_name}-search-r1-ppo-llama3.1-8b-em
14
+ # export BASE_MODEL='meta-llama/Llama-3.1-8B-Instruct'
15
+ # export EXPERIMENT_NAME=${data_name}-search-r1-ppo-llama3.1-8b-it-em
16
+
17
+ # export BASE_MODEL='Qwen/Qwen2.5-3B'
18
+ # export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-3b-em
19
+ # export BASE_MODEL='Qwen/Qwen2.5-3B-Instruct'
20
+ # export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-3b-it-em
21
+ # export BASE_MODEL='Qwen/Qwen2.5-7B'
22
+ # export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-7b-em
23
+ # export BASE_MODEL='Qwen/Qwen2.5-7B-Instruct'
24
+ # export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-7b-it-em
25
+
26
+ # set -x
27
+ export VLLM_ATTENTION_BACKEND=XFORMERS # vllm + qwen2-7b with flash_attn has some issues
28
+
29
+ # max_prompt_length = (config['training']['max_start_length'] + config['training']['max_response_length'] * (config['training']['max_turns'] - 1) + config['training']['max_obs_length'] * config['training']['max_turns'])
30
+
31
+ PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
32
+ data.train_files=$DATA_DIR/train.parquet \
33
+ data.val_files=$DATA_DIR/test.parquet \
34
+ data.train_data_num=null \
35
+ data.val_data_num=null \
36
+ data.train_batch_size=512 \
37
+ data.val_batch_size=256 \
38
+ data.max_prompt_length=4096 \
39
+ data.max_response_length=500 \
40
+ data.max_start_length=2048 \
41
+ data.max_obs_length=500 \
42
+ data.shuffle_train_dataloader=True \
43
+ algorithm.adv_estimator=gae \
44
+ actor_rollout_ref.model.path=$BASE_MODEL \
45
+ actor_rollout_ref.actor.optim.lr=1e-6 \
46
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
47
+ actor_rollout_ref.model.use_remove_padding=True \
48
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.95 \
49
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
50
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
51
+ actor_rollout_ref.actor.fsdp_config.param_offload=true \
52
+ actor_rollout_ref.actor.fsdp_config.grad_offload=true \
53
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=true \
54
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
55
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
56
+ actor_rollout_ref.rollout.name=vllm \
57
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
58
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
59
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
60
+ actor_rollout_ref.rollout.n_agent=1 \
61
+ actor_rollout_ref.rollout.temperature=1 \
62
+ actor_rollout_ref.actor.state_masking=true \
63
+ critic.optim.lr=1e-5 \
64
+ critic.model.use_remove_padding=True \
65
+ critic.optim.lr_warmup_steps_ratio=0.05 \
66
+ critic.model.path=$BASE_MODEL \
67
+ critic.model.enable_gradient_checkpointing=true \
68
+ critic.ppo_micro_batch_size=8 \
69
+ critic.model.fsdp_config.param_offload=true \
70
+ critic.model.fsdp_config.grad_offload=true \
71
+ critic.model.fsdp_config.optimizer_offload=true \
72
+ algorithm.kl_ctrl.kl_coef=0.001 \
73
+ algorithm.no_think_rl=false \
74
+ trainer.critic_warmup=0 \
75
+ trainer.logger=['wandb'] \
76
+ +trainer.val_only=false \
77
+ +trainer.val_before_train=true \
78
+ trainer.default_hdfs_dir=null \
79
+ trainer.n_gpus_per_node=8 \
80
+ trainer.nnodes=1 \
81
+ trainer.save_freq=100 \
82
+ trainer.test_freq=50 \
83
+ trainer.project_name=$WAND_PROJECT \
84
+ trainer.experiment_name=$EXPERIMENT_NAME \
85
+ trainer.total_epochs=15 \
86
+ trainer.total_training_steps=305 \
87
+ trainer.default_hdfs_dir=null \
88
+ trainer.default_local_dir=verl_checkpoints/$EXPERIMENT_NAME \
89
+ max_turns=4 \
90
+ retriever.url="http://127.0.0.1:8000/retrieve" \
91
+ retriever.topk=3 \
92
+ 2>&1 | tee $EXPERIMENT_NAME.log
code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.2/train_grpo.sh ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_name=nq_hotpotqa_train
2
+
3
+ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
4
+ export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
5
+
6
+ WAND_PROJECT="Search-R1"
7
+
8
+ # export BASE_MODEL='Qwen/Qwen2.5-3B'
9
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-3b-em
10
+ # export BASE_MODEL='Qwen/Qwen2.5-3B-Instruct'
11
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-3b-it-em
12
+ export BASE_MODEL='Qwen/Qwen2.5-7B'
13
+ export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-7b-em
14
+ # export BASE_MODEL='Qwen/Qwen2.5-7B-Instruct'
15
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-7b-it-em
16
+ # export BASE_MODEL='Qwen/Qwen2.5-14B'
17
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-14b-em
18
+ # export BASE_MODEL='Qwen/Qwen2.5-14B-Instruct'
19
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-14b-it-em
20
+
21
+ # set -x
22
+ export VLLM_ATTENTION_BACKEND=XFORMERS # vllm + qwen2-7b with flash_attn has some issues
23
+
24
+ # max_prompt_length = (config['training']['max_start_length'] + config['training']['max_response_length'] * (config['training']['max_turns'] - 1) + config['training']['max_obs_length'] * config['training']['max_turns'])
25
+
26
+ PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
27
+ data.train_files=$DATA_DIR/train.parquet \
28
+ data.val_files=$DATA_DIR/test.parquet \
29
+ data.train_data_num=null \
30
+ data.val_data_num=null \
31
+ data.train_batch_size=512 \
32
+ data.val_batch_size=256 \
33
+ data.max_prompt_length=4096 \
34
+ data.max_response_length=500 \
35
+ data.max_start_length=2048 \
36
+ data.max_obs_length=500 \
37
+ data.shuffle_train_dataloader=True \
38
+ algorithm.adv_estimator=grpo \
39
+ actor_rollout_ref.model.path=$BASE_MODEL \
40
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
41
+ actor_rollout_ref.model.use_remove_padding=True \
42
+ actor_rollout_ref.actor.optim.lr=1e-6 \
43
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
44
+ actor_rollout_ref.actor.use_kl_loss=true \
45
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
46
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
47
+ actor_rollout_ref.actor.fsdp_config.param_offload=true \
48
+ actor_rollout_ref.actor.fsdp_config.grad_offload=true \
49
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=true \
50
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
51
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
52
+ actor_rollout_ref.rollout.name=vllm \
53
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
54
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
55
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
56
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
57
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
58
+ algorithm.no_think_rl=false \
59
+ actor_rollout_ref.rollout.n_agent=5 \
60
+ actor_rollout_ref.rollout.temperature=1 \
61
+ actor_rollout_ref.actor.state_masking=true \
62
+ trainer.logger=['wandb'] \
63
+ +trainer.val_only=false \
64
+ +trainer.val_before_train=true \
65
+ trainer.default_hdfs_dir=null \
66
+ trainer.n_gpus_per_node=8 \
67
+ trainer.nnodes=1 \
68
+ trainer.save_freq=100 \
69
+ trainer.test_freq=100 \
70
+ trainer.project_name=$WAND_PROJECT \
71
+ trainer.experiment_name=$EXPERIMENT_NAME \
72
+ trainer.total_epochs=15 \
73
+ trainer.total_training_steps=1005 \
74
+ trainer.default_hdfs_dir=null \
75
+ trainer.default_local_dir=verl_checkpoints/$EXPERIMENT_NAME \
76
+ max_turns=4 \
77
+ retriever.url="http://127.0.0.1:8000/retrieve" \
78
+ retriever.topk=3 \
79
+ 2>&1 | tee $EXPERIMENT_NAME.log
code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.2/train_ppo.sh ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_name=nq_hotpotqa_train
2
+
3
+ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
4
+ export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
5
+
6
+ WAND_PROJECT="Search-R1"
7
+
8
+ # export BASE_MODEL='Qwen/Qwen2.5-3B'
9
+ # export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-3b-em
10
+ # export BASE_MODEL='Qwen/Qwen2.5-3B-Instruct'
11
+ # export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-3b-it-em
12
+ export BASE_MODEL='Qwen/Qwen2.5-7B'
13
+ export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-7b-em
14
+ # export BASE_MODEL='Qwen/Qwen2.5-7B-Instruct'
15
+ # export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-7b-it-em
16
+ # export BASE_MODEL='Qwen/Qwen2.5-14B'
17
+ # export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-14b-em
18
+ # export BASE_MODEL='Qwen/Qwen2.5-14B-Instruct'
19
+ # export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-14b-it-em
20
+
21
+ # set -x
22
+ export VLLM_ATTENTION_BACKEND=XFORMERS # vllm + qwen2-7b with flash_attn has some issues
23
+
24
+ # max_prompt_length = (config['training']['max_start_length'] + config['training']['max_response_length'] * (config['training']['max_turns'] - 1) + config['training']['max_obs_length'] * config['training']['max_turns'])
25
+
26
+ PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
27
+ data.train_files=$DATA_DIR/train.parquet \
28
+ data.val_files=$DATA_DIR/test.parquet \
29
+ data.train_data_num=null \
30
+ data.val_data_num=null \
31
+ data.train_batch_size=512 \
32
+ data.val_batch_size=256 \
33
+ data.max_prompt_length=4096 \
34
+ data.max_response_length=500 \
35
+ data.max_start_length=2048 \
36
+ data.max_obs_length=500 \
37
+ data.shuffle_train_dataloader=True \
38
+ algorithm.adv_estimator=gae \
39
+ actor_rollout_ref.model.path=$BASE_MODEL \
40
+ actor_rollout_ref.actor.optim.lr=1e-6 \
41
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
42
+ actor_rollout_ref.model.use_remove_padding=True \
43
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
44
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
45
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
46
+ actor_rollout_ref.actor.fsdp_config.param_offload=true \
47
+ actor_rollout_ref.actor.fsdp_config.grad_offload=true \
48
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=true \
49
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
50
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
51
+ actor_rollout_ref.rollout.name=vllm \
52
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
53
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
54
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
55
+ actor_rollout_ref.rollout.n_agent=1 \
56
+ actor_rollout_ref.rollout.temperature=1 \
57
+ actor_rollout_ref.rollout.top_p=1.0 \
58
+ actor_rollout_ref.actor.state_masking=true \
59
+ critic.optim.lr=1e-5 \
60
+ critic.model.use_remove_padding=True \
61
+ critic.optim.lr_warmup_steps_ratio=0.015 \
62
+ critic.model.path=$BASE_MODEL \
63
+ critic.model.enable_gradient_checkpointing=true \
64
+ critic.ppo_micro_batch_size=8 \
65
+ critic.model.fsdp_config.param_offload=true \
66
+ critic.model.fsdp_config.grad_offload=true \
67
+ critic.model.fsdp_config.optimizer_offload=true \
68
+ algorithm.kl_ctrl.kl_coef=0.001 \
69
+ algorithm.no_think_rl=false \
70
+ trainer.critic_warmup=0 \
71
+ trainer.logger=['wandb'] \
72
+ +trainer.val_only=false \
73
+ +trainer.val_before_train=true \
74
+ trainer.default_hdfs_dir=null \
75
+ trainer.n_gpus_per_node=8 \
76
+ trainer.nnodes=1 \
77
+ trainer.save_freq=100 \
78
+ trainer.test_freq=100 \
79
+ trainer.project_name=$WAND_PROJECT \
80
+ trainer.experiment_name=$EXPERIMENT_NAME \
81
+ trainer.total_epochs=15 \
82
+ trainer.total_training_steps=1005 \
83
+ trainer.default_hdfs_dir=null \
84
+ trainer.default_local_dir=verl_checkpoints/$EXPERIMENT_NAME \
85
+ max_turns=4 \
86
+ retriever.url="http://127.0.0.1:8000/retrieve" \
87
+ retriever.topk=3 \
88
+ 2>&1 | tee $EXPERIMENT_NAME.log
code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.3/train_grpo_format.sh ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_name=nq_hotpotqa_train
2
+
3
+ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
4
+ export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
5
+
6
+ WAND_PROJECT="Search-R1"
7
+
8
+ export BASE_MODEL='Qwen/Qwen2.5-3B'
9
+ export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-3b-em-structureformat
10
+ # export BASE_MODEL='Qwen/Qwen2.5-3B-Instruct'
11
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-3b-it-em-structureformat
12
+ # export BASE_MODEL='Qwen/Qwen2.5-7B'
13
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-7b-em-structureformat
14
+ # export BASE_MODEL='Qwen/Qwen2.5-7B-Instruct'
15
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-7b-it-em-structureformat
16
+ # export BASE_MODEL='Qwen/Qwen2.5-14B'
17
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-14b-em-structureformat
18
+ # export BASE_MODEL='Qwen/Qwen2.5-14B-Instruct'
19
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-14b-it-em-structureformat
20
+
21
+ # export BASE_MODEL='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'
22
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-deepseekr1-7b-em-structureformat
23
+ # export BASE_MODEL='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B'
24
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-deepseekr1-14b-em-structureformat
25
+
26
+ # set -x
27
+ export VLLM_ATTENTION_BACKEND=XFORMERS # vllm + qwen2-7b with flash_attn has some issues
28
+
29
+ # max_prompt_length = (config['training']['max_start_length'] + config['training']['max_response_length'] * (config['training']['max_turns'] - 1) + config['training']['max_obs_length'] * config['training']['max_turns'])
30
+
31
+ PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo_format \
32
+ data.train_files=$DATA_DIR/train.parquet \
33
+ data.val_files=$DATA_DIR/test.parquet \
34
+ data.train_data_num=null \
35
+ data.val_data_num=null \
36
+ data.train_batch_size=512 \
37
+ data.val_batch_size=256 \
38
+ data.max_prompt_length=4096 \
39
+ data.max_response_length=500 \
40
+ data.max_start_length=2048 \
41
+ data.max_obs_length=500 \
42
+ data.shuffle_train_dataloader=True \
43
+ algorithm.adv_estimator=grpo \
44
+ actor_rollout_ref.model.path=$BASE_MODEL \
45
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
46
+ actor_rollout_ref.model.use_remove_padding=True \
47
+ actor_rollout_ref.actor.optim.lr=5e-7 \
48
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
49
+ actor_rollout_ref.actor.use_kl_loss=true \
50
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
51
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
52
+ actor_rollout_ref.actor.fsdp_config.param_offload=true \
53
+ actor_rollout_ref.actor.fsdp_config.grad_offload=true \
54
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=true \
55
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
56
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
57
+ actor_rollout_ref.rollout.name=vllm \
58
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
59
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
60
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
61
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
62
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
63
+ algorithm.no_think_rl=false \
64
+ actor_rollout_ref.rollout.n_agent=5 \
65
+ actor_rollout_ref.rollout.temperature=1 \
66
+ actor_rollout_ref.actor.state_masking=true \
67
+ trainer.logger=['wandb'] \
68
+ +trainer.val_only=false \
69
+ +trainer.val_before_train=true \
70
+ trainer.default_hdfs_dir=null \
71
+ trainer.n_gpus_per_node=8 \
72
+ trainer.nnodes=1 \
73
+ trainer.save_freq=100 \
74
+ trainer.test_freq=100 \
75
+ trainer.project_name=$WAND_PROJECT \
76
+ trainer.experiment_name=$EXPERIMENT_NAME \
77
+ trainer.total_epochs=15 \
78
+ trainer.total_training_steps=1005 \
79
+ trainer.default_hdfs_dir=null \
80
+ trainer.default_local_dir=/home/peterjin/verl_checkpoints/$EXPERIMENT_NAME \
81
+ reward_model.structure_format_score=0.2 \
82
+ reward_model.final_format_score=0.1 \
83
+ reward_model.retrieval_score=0 \
84
+ max_turns=4 \
85
+ retriever.url="http://127.0.0.1:8000/retrieve" \
86
+ retriever.topk=3 \
87
+ 2>&1 | tee /home/peterjin/rl_logs/$EXPERIMENT_NAME.log
code/RL_model/verl/Search-R1/misc/scripts/nq_hotpotqa/v0.3/train_ppo_format.sh ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_name=nq_hotpotqa_train
2
+
3
+ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
4
+ export DATA_DIR=data/${data_name} # first download the data from https://huggingface.co/datasets/PeterJinGo/nq_hotpotqa_train
5
+
6
+ WAND_PROJECT="Search-R1"
7
+
8
+ export BASE_MODEL='Qwen/Qwen2.5-3B'
9
+ export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-3b-em-structureformat
10
+ # export BASE_MODEL='Qwen/Qwen2.5-3B-Instruct'
11
+ # export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-3b-it-em-structureformat
12
+ # export BASE_MODEL='Qwen/Qwen2.5-7B'
13
+ # export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-7b-em-structureformat
14
+ # export BASE_MODEL='Qwen/Qwen2.5-7B-Instruct'
15
+ # export EXPERIMENT_NAME=${data_name}-search-r1-ppo-qwen2.5-7b-it-em-structureformat
16
+ # export BASE_MODEL='Qwen/Qwen2.5-14B'
17
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-14b-em-structureformat
18
+ # export BASE_MODEL='Qwen/Qwen2.5-14B-Instruct'
19
+ # export EXPERIMENT_NAME=${data_name}-search-r1-grpo-qwen2.5-14b-it-em-structureformat
20
+
21
+ # export BASE_MODEL='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B'
22
+ # export EXPERIMENT_NAME=${data_name}-search-r1-ppo-deepseekr1-14b-em-structureformat
23
+
24
+ # set -x
25
+ export VLLM_ATTENTION_BACKEND=XFORMERS # vllm + qwen2-7b with flash_attn has some issues
26
+
27
+ # max_prompt_length = (config['training']['max_start_length'] + config['training']['max_response_length'] * (config['training']['max_turns'] - 1) + config['training']['max_obs_length'] * config['training']['max_turns'])
28
+
29
+ PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo_format \
30
+ data.train_files=$DATA_DIR/train.parquet \
31
+ data.val_files=$DATA_DIR/test.parquet \
32
+ data.train_data_num=null \
33
+ data.val_data_num=null \
34
+ data.train_batch_size=512 \
35
+ data.val_batch_size=256 \
36
+ data.max_prompt_length=4096 \
37
+ data.max_response_length=500 \
38
+ data.max_start_length=2048 \
39
+ data.max_obs_length=500 \
40
+ data.shuffle_train_dataloader=True \
41
+ algorithm.adv_estimator=gae \
42
+ actor_rollout_ref.model.path=$BASE_MODEL \
43
+ actor_rollout_ref.actor.optim.lr=1e-6 \
44
+ actor_rollout_ref.model.enable_gradient_checkpointing=true \
45
+ actor_rollout_ref.model.use_remove_padding=True \
46
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.285 \
47
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
48
+ actor_rollout_ref.actor.ppo_micro_batch_size=64 \
49
+ actor_rollout_ref.actor.fsdp_config.param_offload=true \
50
+ actor_rollout_ref.actor.fsdp_config.grad_offload=true \
51
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=true \
52
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
53
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
54
+ actor_rollout_ref.rollout.name=vllm \
55
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
56
+ actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
57
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
58
+ actor_rollout_ref.rollout.n_agent=1 \
59
+ actor_rollout_ref.rollout.temperature=1 \
60
+ actor_rollout_ref.rollout.top_p=1.0 \
61
+ actor_rollout_ref.actor.state_masking=true \
62
+ critic.optim.lr=1e-5 \
63
+ critic.model.use_remove_padding=True \
64
+ critic.optim.lr_warmup_steps_ratio=0.015 \
65
+ critic.model.path=$BASE_MODEL \
66
+ critic.model.enable_gradient_checkpointing=true \
67
+ critic.ppo_micro_batch_size=8 \
68
+ critic.model.fsdp_config.param_offload=true \
69
+ critic.model.fsdp_config.grad_offload=true \
70
+ critic.model.fsdp_config.optimizer_offload=true \
71
+ algorithm.kl_ctrl.kl_coef=0.001 \
72
+ algorithm.no_think_rl=false \
73
+ trainer.critic_warmup=0 \
74
+ trainer.logger=['wandb'] \
75
+ +trainer.val_only=false \
76
+ +trainer.val_before_train=true \
77
+ trainer.default_hdfs_dir=null \
78
+ trainer.n_gpus_per_node=8 \
79
+ trainer.nnodes=1 \
80
+ trainer.save_freq=100 \
81
+ trainer.test_freq=100 \
82
+ trainer.project_name=$WAND_PROJECT \
83
+ trainer.experiment_name=$EXPERIMENT_NAME \
84
+ trainer.total_epochs=15 \
85
+ trainer.total_training_steps=1005 \
86
+ trainer.default_hdfs_dir=null \
87
+ trainer.default_local_dir=/home/peterjin/verl_checkpoints/$EXPERIMENT_NAME \
88
+ reward_model.structure_format_score=0.2 \
89
+ reward_model.final_format_score=0.1 \
90
+ reward_model.retrieval_score=0 \
91
+ max_turns=4 \
92
+ retriever.url="http://127.0.0.1:8000/retrieve" \
93
+ retriever.topk=3 \
94
+ 2>&1 | tee /home/peterjin/rl_logs/$EXPERIMENT_NAME.log
code/RL_model/verl/Search-R1/misc/scripts/upload.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from huggingface_hub import upload_file
3
+
4
+ repo_id = "PeterJinGo/wiki-18-e5-index"
5
+ path = "/home/peterjin/mnt/index/wiki-18"
6
+ for file in ["part_aa", "part_ab"]:
7
+ upload_file(
8
+ path_or_fileobj=os.path.join(path, file), # File path
9
+ path_in_repo=file, # Destination filename in the repo
10
+ repo_id=repo_id, # Your dataset repo ID
11
+ repo_type="dataset"
12
+ )
code/RL_model/verl/Search-R1/misc/scripts/upload.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+
2
+ index=/home/peterjin/mnt/index/wiki-18/e5_Flat.index
3
+
4
+ split -b 40G $index part_
5
+
6
+ python upload.py
code/RL_model/verl/Search-R1/verl/models/README.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Models
2
+ Common model zoos such as huggingface/transformers struggle when using PyTorch native model parallelism. Following the design principle of vLLM, we keep a simple, parallelizable, highly-optimized model implementation with packed inputs in verl.
3
+ ## Adding a New Huggingface Model
4
+ ### Step 1: Copy the model file from HF to verl
5
+ - Add a new file under verl/models/hf
6
+ - Copy ONLY the model file from huggingface/transformers/models to verl/models/hf
7
+
8
+ ### Step 2: Modify the model file to use packed inputs
9
+ - Remove all the code related to inference (kv cache)
10
+ - Modify the inputs to include only
11
+ - input_ids (total_nnz,)
12
+ - cu_seqlens (total_nnz + 1,)
13
+ - max_seqlen_in_batch: int
14
+ - Note that this requires using flash attention with causal mask.
15
+
16
+ ### Step 2.5: Add tests
17
+ - Add a test to compare this version and the huggingface version
18
+ - Following the infrastructure and add tests to tests/models/hf
19
+
20
+ ### Step 3: Add a function to apply tensor parallelism
21
+ - Please follow
22
+ - https://pytorch.org/docs/stable/distributed.tensor.parallel.html
23
+ - https://pytorch.org/tutorials/intermediate/TP_tutorial.html
24
+ - General comments
25
+ - Tensor Parallelism in native PyTorch is NOT auto-parallelism. The way it works is to specify, via configs, how model parameters and inputs/outputs are resharded. These configs are then registered as hooks to perform input/output resharding before/after the model forward pass.
26
+
27
+ ### Step 4: Add a function to apply data parallelism
28
+ - Please use FSDP2 APIs
29
+ - See demo here https://github.com/pytorch/torchtitan/blob/main/torchtitan/parallelisms/parallelize_llama.py#L413
30
+
31
+ ### Step 5: Add a function to apply pipeline parallelism
32
+ - Comes in PyTorch 2.4
33
+ - Currently only in alpha in nightly version
34
+ - Check torchtitan for more details
35
+
code/RL_model/verl/Search-R1/verl/models/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
code/RL_model/verl/Search-R1/verl/models/llama/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.