Lekr0 commited on
Commit
a402b9b
·
verified ·
1 Parent(s): 61ba51e

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_39000/README.md +207 -0
  2. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_39000/adapter_config.json +43 -0
  3. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_44000/README.md +207 -0
  4. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_44000/adapter_config.json +43 -0
  5. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_76500/README.md +207 -0
  6. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_76500/adapter_config.json +43 -0
  7. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_81500/README.md +207 -0
  8. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_81500/adapter_config.json +43 -0
  9. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_89000/README.md +207 -0
  10. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_89000/adapter_config.json +43 -0
  11. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_95000/README.md +207 -0
  12. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_95000/adapter_config.json +43 -0
  13. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_112500/adapter_config.json +43 -0
  14. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_117000/README.md +207 -0
  15. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_117000/adapter_config.json +43 -0
  16. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_144500/README.md +207 -0
  17. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_148500/README.md +207 -0
  18. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_148500/adapter_config.json +43 -0
  19. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_164000/README.md +207 -0
  20. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_169500/README.md +207 -0
  21. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_169500/adapter_config.json +43 -0
  22. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_188500/README.md +207 -0
  23. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_188500/adapter_config.json +43 -0
  24. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_189500/README.md +207 -0
  25. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_189500/adapter_config.json +43 -0
  26. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_202500/README.md +207 -0
  27. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_202500/adapter_config.json +43 -0
  28. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_217000/README.md +207 -0
  29. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_217000/adapter_config.json +43 -0
  30. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_218000/README.md +207 -0
  31. progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_218000/adapter_config.json +43 -0
  32. sglang/benchmark/asr/README.md +166 -0
  33. sglang/benchmark/asr/bench_sglang.py +404 -0
  34. sglang/benchmark/bench_attention_sink/bench_attention_sink_triton.py +250 -0
  35. sglang/benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py +130 -0
  36. sglang/benchmark/bench_rope/benchmark_rope_index.py +425 -0
  37. sglang/benchmark/benchmark_batch/benchmark_batch.py +193 -0
  38. sglang/benchmark/benchmark_batch/benchmark_tokenizer.py +237 -0
  39. sglang/benchmark/benchmark_vllm_060/README.md +89 -0
  40. sglang/benchmark/blog_v0_2/405b_sglang.sh +24 -0
  41. sglang/benchmark/blog_v0_2/405b_trt.sh +17 -0
  42. sglang/benchmark/blog_v0_2/405b_vllm.sh +24 -0
  43. sglang/benchmark/blog_v0_2/README.md +164 -0
  44. sglang/benchmark/blog_v0_2/config.md +100 -0
  45. sglang/benchmark/boolq/README.md +19 -0
  46. sglang/benchmark/boolq/bench_sglang.py +124 -0
  47. sglang/benchmark/boolq/convert_parquet_to_json.py +28 -0
  48. sglang/benchmark/boolq/parquet_to_json.sh +26 -0
  49. sglang/benchmark/ceval/README.md +15 -0
  50. sglang/benchmark/ceval/bench_sglang.py +138 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_39000/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /workspace/Qwen3-8B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/workspace/Qwen3-8B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_39000/adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/workspace/Qwen3-8B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj"
36
+ ],
37
+ "target_parameters": null,
38
+ "task_type": "CAUSAL_LM",
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_44000/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /workspace/Qwen3-8B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/workspace/Qwen3-8B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_44000/adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/workspace/Qwen3-8B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj"
36
+ ],
37
+ "target_parameters": null,
38
+ "task_type": "CAUSAL_LM",
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_76500/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /workspace/Qwen3-8B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/workspace/Qwen3-8B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_76500/adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/workspace/Qwen3-8B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj"
36
+ ],
37
+ "target_parameters": null,
38
+ "task_type": "CAUSAL_LM",
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_81500/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /workspace/Qwen3-8B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/workspace/Qwen3-8B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_81500/adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/workspace/Qwen3-8B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj"
36
+ ],
37
+ "target_parameters": null,
38
+ "task_type": "CAUSAL_LM",
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_89000/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /workspace/Qwen3-8B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/workspace/Qwen3-8B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_89000/adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/workspace/Qwen3-8B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj"
36
+ ],
37
+ "target_parameters": null,
38
+ "task_type": "CAUSAL_LM",
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_95000/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /workspace/Qwen3-8B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/workspace/Qwen3-8B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_95000/adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/workspace/Qwen3-8B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj"
36
+ ],
37
+ "target_parameters": null,
38
+ "task_type": "CAUSAL_LM",
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_112500/adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/workspace/Qwen3-8B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj"
36
+ ],
37
+ "target_parameters": null,
38
+ "task_type": "CAUSAL_LM",
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_117000/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /workspace/Qwen3-8B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/workspace/Qwen3-8B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_117000/adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/workspace/Qwen3-8B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj"
36
+ ],
37
+ "target_parameters": null,
38
+ "task_type": "CAUSAL_LM",
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_144500/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /workspace/Qwen3-8B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/workspace/Qwen3-8B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_148500/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /workspace/Qwen3-8B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/workspace/Qwen3-8B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section covers use of the model as-is, without fine-tuning or plugging it into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section covers use of the model when it is fine-tuned for a task or plugged into a larger ecosystem/app. -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, its APA and BibTeX citation information should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_148500/adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/workspace/Qwen3-8B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj"
36
+ ],
37
+ "target_parameters": null,
38
+ "task_type": "CAUSAL_LM",
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_164000/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /workspace/Qwen3-8B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/workspace/Qwen3-8B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section covers use of the model as-is, without fine-tuning or plugging it into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section covers use of the model when it is fine-tuned for a task or plugged into a larger ecosystem/app. -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, its APA and BibTeX citation information should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_169500/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /workspace/Qwen3-8B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/workspace/Qwen3-8B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section covers use of the model as-is, without fine-tuning or plugging it into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section covers use of the model when it is fine-tuned for a task or plugged into a larger ecosystem/app. -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, its APA and BibTeX citation information should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_169500/adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/workspace/Qwen3-8B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj"
36
+ ],
37
+ "target_parameters": null,
38
+ "task_type": "CAUSAL_LM",
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_188500/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /workspace/Qwen3-8B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/workspace/Qwen3-8B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section covers use of the model as-is, without fine-tuning or plugging it into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section covers use of the model when it is fine-tuned for a task or plugged into a larger ecosystem/app. -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, its APA and BibTeX citation information should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_188500/adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/workspace/Qwen3-8B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj"
36
+ ],
37
+ "target_parameters": null,
38
+ "task_type": "CAUSAL_LM",
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_189500/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /workspace/Qwen3-8B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/workspace/Qwen3-8B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for it should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_189500/adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/workspace/Qwen3-8B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj"
36
+ ],
37
+ "target_parameters": null,
38
+ "task_type": "CAUSAL_LM",
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_202500/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /workspace/Qwen3-8B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/workspace/Qwen3-8B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for it should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_202500/adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/workspace/Qwen3-8B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj"
36
+ ],
37
+ "target_parameters": null,
38
+ "task_type": "CAUSAL_LM",
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_217000/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /workspace/Qwen3-8B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/workspace/Qwen3-8B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for it should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_217000/adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/workspace/Qwen3-8B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj"
36
+ ],
37
+ "target_parameters": null,
38
+ "task_type": "CAUSAL_LM",
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_218000/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /workspace/Qwen3-8B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/workspace/Qwen3-8B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for it should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_218000/adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/workspace/Qwen3-8B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "k_proj",
34
+ "o_proj",
35
+ "v_proj"
36
+ ],
37
+ "target_parameters": null,
38
+ "task_type": "CAUSAL_LM",
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
sglang/benchmark/asr/README.md ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ASR Benchmark
2
+
3
+ This benchmark evaluates the performance and accuracy (Word Error Rate - WER) of Automatic Speech Recognition (ASR) models served via SGLang.
4
+
5
+ ## Supported Models
6
+
7
+ - `openai/whisper-large-v3`
8
+ - `openai/whisper-large-v3-turbo`
9
+
10
+ ## Setup
11
+
12
+ Install the required dependencies:
13
+
14
+ ```bash
15
+ apt install ffmpeg
16
+ pip install librosa soundfile datasets evaluate jiwer transformers openai torchcodec torch
17
+ ```
18
+
19
+ ## Running the Benchmark
20
+
21
+ ### 1. Start SGLang Server
22
+
23
+ Launch the SGLang server with a Whisper model:
24
+
25
+ ```bash
26
+ python -m sglang.launch_server --model-path openai/whisper-large-v3 --port 30000
27
+ ```
28
+
29
+ ### 2. Run the Benchmark Script
30
+
31
+ Basic usage (using chat completions API):
32
+
33
+ ```bash
34
+ python bench_sglang.py --base-url http://localhost:30000 --model openai/whisper-large-v3 --n-examples 10
35
+ ```
36
+
37
+ Using the OpenAI-compatible transcription API:
38
+
39
+ ```bash
40
+ python bench_sglang.py \
41
+ --base-url http://localhost:30000 \
42
+ --model openai/whisper-large-v3 \
43
+ --api-type transcription \
44
+ --language English \
45
+ --n-examples 10
46
+ ```
47
+
48
+ Run with streaming and show real-time output:
49
+
50
+ ```bash
51
+ python bench_sglang.py \
52
+ --base-url http://localhost:30000 \
53
+ --model openai/whisper-large-v3 \
54
+ --api-type transcription \
55
+ --stream \
56
+ --show-predictions \
57
+ --concurrency 1
58
+ ```
59
+
60
+ Run with higher concurrency and save results:
61
+
62
+ ```bash
63
+ python bench_sglang.py \
64
+ --base-url http://localhost:30000 \
65
+ --model openai/whisper-large-v3 \
66
+ --concurrency 8 \
67
+ --n-examples 100 \
68
+ --output results.json \
69
+ --show-predictions
70
+ ```
71
+
72
+ ## Arguments
73
+
74
+ | Argument | Description | Default |
75
+ |----------|-------------|---------|
76
+ | `--base-url` | SGLang server URL | `http://localhost:30000` |
77
+ | `--model` | Model name on the server | `openai/whisper-large-v3` |
78
+ | `--dataset` | HuggingFace dataset for evaluation | `D4nt3/esb-datasets-earnings22-validation-tiny-filtered` |
79
+ | `--split` | Dataset split to use | `validation` |
80
+ | `--concurrency` | Number of concurrent requests | `4` |
81
+ | `--n-examples` | Number of examples to process (`-1` for all) | `-1` |
82
+ | `--output` | Path to save results as JSON | `None` |
83
+ | `--show-predictions` | Display sample predictions | `False` |
84
+ | `--print-n` | Number of samples to display | `5` |
85
+ | `--api-type` | API to use: `chat` (chat completions) or `transcription` (audio transcriptions) | `chat` |
86
+ | `--language` | Language for transcription API (e.g., `English`, `en`) | `None` |
87
+ | `--stream` | Enable streaming mode for transcription API | `False` |
88
+
89
+ ## Metrics
90
+
91
+ The benchmark outputs:
92
+
93
+ | Metric | Description |
94
+ |--------|-------------|
95
+ | **Total Requests** | Number of successful ASR requests processed |
96
+ | **WER** | Word Error Rate (lower is better), computed using the `evaluate` library |
97
+ | **Average Latency** | Mean time per request (seconds) |
98
+ | **Median Latency** | 50th percentile latency (seconds) |
99
+ | **95th Latency** | 95th percentile latency (seconds) |
100
+ | **Throughput** | Requests processed per second |
101
+ | **Token Throughput** | Output tokens per second |
102
+
103
+ ## Example Output
104
+
105
+ ```bash
106
+ python bench_sglang.py --api-type transcription --concurrency 128 --model openai/whisper-large-v3 --show-predictions
107
+
108
+ Loading dataset: D4nt3/esb-datasets-earnings22-validation-tiny-filtered...
109
+ Using API type: transcription
110
+ Repo card metadata block was not found. Setting CardData to empty.
111
+ WARNING:huggingface_hub.repocard:Repo card metadata block was not found. Setting CardData to empty.
112
+ Performing warmup...
113
+ Processing 511 samples...
114
+ ------------------------------
115
+ Results for openai/whisper-large-v3:
116
+ Total Requests: 511
117
+ WER: 12.7690
118
+ Average Latency: 1.3602s
119
+ Median Latency: 1.2090s
120
+ 95th Latency: 2.9986s
121
+ Throughput: 19.02 req/s
122
+ Token Throughput: 354.19 tok/s
123
+ Total Test Time: 26.8726s
124
+ ------------------------------
125
+
126
+ ==================== Sample Predictions ====================
127
+ Sample 1:
128
+ REF: on the use of taxonomy i you know i think it is it is early days for us to to make any clear indications to the market about the proportion that would fall under that requirement
129
+ PRED: on the eu taxonomy i think it is early days for us to make any clear indications to the market about the proportion that would fall under that requirement
130
+ ----------------------------------------
131
+ Sample 2:
132
+ REF: so within fiscal year 2021 say 120 a 100 depending on what the micro will do and next year it is not necessarily payable in q one is we will look at what the cash flows for 2022 look like
133
+ PRED: so within fiscal year 2021 say $120000 $100000 depending on what the macro will do and next year it is not necessarily payable in q one is we will look at what the cash flows for 2022 look like
134
+ ----------------------------------------
135
+ Sample 3:
136
+ REF: we talked about 4.7 gigawatts
137
+ PRED: we talked about 4.7 gigawatts
138
+ ----------------------------------------
139
+ Sample 4:
140
+ REF: and you know depending on that working capital build we will we will see what that yields
141
+ PRED: and depending on that working capital build we will see what that yields what
142
+ ----------------------------------------
143
+ Sample 5:
144
+ REF: so on on sinopec what we have agreed with sinopec way back then is that free cash flows after paying all capexs are distributed out 30 70%
145
+ PRED: so on sinopec what we have agreed with sinopec way back then is that free cash flows after paying all capexes are distributed out 30% 70%
146
+ ----------------------------------------
147
+ ============================================================
148
+ ```
149
+
150
+ ## Notes
151
+
152
+ - Audio samples of 30 seconds or longer are automatically filtered out (Whisper limitation)
153
+ - The benchmark performs a warmup request before measuring performance
154
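+ - Predictions and references are normalized with the tokenizer's `normalize` method when it is available; otherwise they are lower-cased and stripped of surrounding whitespace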
155
+ - When using `--stream` with `--show-predictions`, use `--concurrency 1` for clean sequential output
156
+ - The `--language` option accepts both full names (e.g., `English`) and ISO 639-1 codes (e.g., `en`)
157
+
158
+ ## Troubleshooting
159
+
160
+ **Server connection refused**
161
+ - Ensure the SGLang server is running and accessible at the specified `--base-url`
162
+ - Check that the port is not blocked by a firewall
163
+
164
+ **Out of memory errors**
165
+ - Reduce `--concurrency` to lower GPU memory usage
166
+ - Use a smaller Whisper model variant
sglang/benchmark/asr/bench_sglang.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import asyncio
3
+ import base64
4
+ import io
5
+ import json
6
+ import time
7
+ from statistics import mean, median
8
+
9
+ import httpx
10
+ import librosa
11
+ import numpy as np
12
+ import soundfile
13
+ from datasets import load_dataset
14
+ from evaluate import load
15
+ from openai import AsyncOpenAI, OpenAI
16
+ from transformers import AutoTokenizer
17
+
18
+
19
def to_bytes(y, sr):
    """Encode audio samples as WAV data in an in-memory buffer.

    Args:
        y: Audio samples, handed unchanged to ``soundfile.write``.
        sr: Sampling rate that belongs to ``y``.

    Returns:
        An ``io.BytesIO`` holding the WAV data, rewound to position 0 so the
        caller can read it from the start.
    """
    wav_buffer = io.BytesIO()
    soundfile.write(wav_buffer, y, sr, format="WAV")
    wav_buffer.seek(0)
    return wav_buffer
24
+
25
+
26
async def run_asr_chat(client, model_name, y, sr):
    """Transcribe one clip through the chat completions API.

    The clip is WAV-encoded, base64-encoded, and sent as a single
    ``audio_url`` content part carrying a ``data:`` URL.

    Args:
        client: Client object exposing an awaitable
            ``chat.completions.create``.
        model_name: Model name forwarded to the server.
        y: Audio samples.
        sr: Sampling rate of ``y``.

    Returns:
        ``(latency_seconds, text)`` where ``text`` is the content of the
        first choice's message.
    """
    with to_bytes(y, sr) as wav_file:
        encoded_audio = base64.b64encode(wav_file.read()).decode("utf-8")

    audio_part = {
        "type": "audio_url",
        "audio_url": {"url": f"data:audio/wav;base64,{encoded_audio}"},
    }
    request_messages = [{"role": "user", "content": [audio_part]}]

    # Only the request is timed; audio encoding happens before the clock starts.
    started = time.perf_counter()
    response = await client.chat.completions.create(
        model=model_name,
        messages=request_messages,
        temperature=0.0,
    )
    elapsed = time.perf_counter() - started

    return elapsed, response.choices[0].message.content
53
+
54
+
55
def run_asr_transcription_sync(client, model_name, y, sr, language=None):
    """Transcribe one clip through the audio transcriptions API (blocking).

    Args:
        client: Client object exposing ``audio.transcriptions.create``.
        model_name: Model name forwarded to the server.
        y: Audio samples.
        sr: Sampling rate of ``y``.
        language: Optional language hint; only sent when truthy.

    Returns:
        ``(latency_seconds, text)`` for the request.
    """
    wav_file = to_bytes(y, sr)
    wav_file.name = "audio.wav"  # OpenAI client needs a name attribute

    started = time.perf_counter()
    request_args = {"model": model_name, "file": wav_file}
    if language:
        request_args["language"] = language

    result = client.audio.transcriptions.create(**request_args)
    elapsed = time.perf_counter() - started

    return elapsed, result.text
73
+
74
+
75
def run_asr_transcription_stream_sync(
    base_url, model_name, y, sr, language=None, show_stream=False
):
    """Transcribe one clip through the streaming transcriptions endpoint.

    Posts the WAV-encoded clip to ``{base_url}/v1/audio/transcriptions`` with
    ``stream=true`` and concatenates the ``delta.content`` pieces found in the
    ``data: `` lines of the response.

    Args:
        base_url: Server base URL (without the ``/v1`` suffix).
        model_name: Model name forwarded to the server.
        y: Audio samples.
        sr: Sampling rate of ``y``.
        language: Optional language hint; only sent when truthy.
        show_stream: When true, echo each received piece to stdout as it
            arrives.

    Returns:
        ``(latency_seconds, text)`` where ``text`` is the concatenation of all
        received pieces.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
            Without this check an error response would produce an empty
            transcription that is scored as a successful request.
    """
    audio_bytes = to_bytes(y, sr).read()

    data = {
        "model": model_name,
        "response_format": "json",
        "stream": "true",
    }
    if language:
        data["language"] = language

    start_time = time.perf_counter()
    text_chunks = []

    if show_stream:
        print("[STREAM] ", end="", flush=True)

    with httpx.stream(
        "POST",
        f"{base_url}/v1/audio/transcriptions",
        data=data,
        files={"file": ("audio.wav", audio_bytes, "audio/wav")},
        timeout=60.0,
    ) as response:
        # Fail loudly on HTTP errors instead of returning an empty transcript.
        response.raise_for_status()

        for line in response.iter_lines():
            if not line.startswith("data: ") or line.startswith("data: [DONE]"):
                continue
            try:
                chunk = json.loads(line[6:])
            except json.JSONDecodeError:
                # Best effort: skip lines whose payload is not valid JSON.
                continue
            if "choices" in chunk and chunk["choices"]:
                delta = chunk["choices"][0].get("delta", {})
                content = delta.get("content", "")
                if content:
                    text_chunks.append(content)
                    if show_stream:
                        print(content, end="", flush=True)

    if show_stream:
        print()  # newline after stream

    latency = time.perf_counter() - start_time
    return latency, "".join(text_chunks)
123
+
124
+
125
async def run_asr_transcription(
    client,
    model_name,
    y,
    sr,
    language=None,
    stream=False,
    base_url=None,
    show_stream=False,
):
    """Run a blocking transcription call in the loop's default executor.

    Args:
        client: Sync client used for the non-streaming path.
        model_name: Model name forwarded to the server.
        y: Audio samples.
        sr: Sampling rate of ``y``.
        language: Optional language hint.
        stream: When true, use the streaming HTTP path instead of ``client``.
        base_url: Server base URL; used only by the streaming path.
        show_stream: Forwarded to the streaming path to echo pieces to stdout.

    Returns:
        ``(latency_seconds, text)`` as produced by the selected sync helper.
    """
    # This coroutine always runs inside a loop, so ask for the running loop
    # explicitly rather than going through the get_event_loop() policy lookup.
    loop = asyncio.get_running_loop()
    if stream:
        return await loop.run_in_executor(
            None,
            run_asr_transcription_stream_sync,
            base_url,
            model_name,
            y,
            sr,
            language,
            show_stream,
        )
    return await loop.run_in_executor(
        None, run_asr_transcription_sync, client, model_name, y, sr, language
    )
151
+
152
+
153
async def bound_asr(
    sem,
    client,
    model_name,
    tokenizer,
    audio,
    reference,
    api_type="chat",
    language=None,
    stream=False,
    base_url=None,
    show_stream=False,
):
    """Transcribe one sample while holding ``sem`` and prepare it for scoring.

    Args:
        sem: Async context manager that limits concurrent requests.
        client: Client passed on to the selected ASR helper.
        model_name: Model name forwarded to the server.
        tokenizer: Callable tokenizer; its ``input_ids`` length is used as the
            output token count, and its ``normalize`` method (if present) is
            used for text normalization.
        audio: ``(samples, sampling_rate)`` pair, unpacked into the helper.
        reference: Reference transcript for this sample.
        api_type: ``"transcription"`` selects the transcriptions API; any
            other value selects the chat completions API.
        language: Optional language hint (transcription path only).
        stream: Enable streaming (transcription path only).
        base_url: Server base URL (transcription path only).
        show_stream: Echo streamed pieces to stdout (transcription path only).

    Returns:
        ``(latency, num_output_tokens, normalized_prediction,
        normalized_reference)``, or ``None`` if any exception occurred (the
        error is printed).
    """
    async with sem:
        try:
            if api_type != "transcription":
                latency, text = await run_asr_chat(client, model_name, *audio)
            else:
                latency, text = await run_asr_transcription(
                    client,
                    model_name,
                    *audio,
                    language=language,
                    stream=stream,
                    base_url=base_url,
                    show_stream=show_stream,
                )

            # Token count feeds the token-throughput metric.
            token_ids = tokenizer(text, add_special_tokens=False).input_ids
            num_output_tokens = len(token_ids)

            # Normalize both sides the same way before WER is computed.
            if hasattr(tokenizer, "normalize"):
                out, ref = tokenizer.normalize(text), tokenizer.normalize(reference)
            else:
                out, ref = text.lower().strip(), reference.lower().strip()

            return latency, num_output_tokens, out, ref
        except Exception as e:
            print(f"Error during ASR: {e}")
            return None
197
+
198
+
199
async def process_dataset(
    model_name,
    client,
    data,
    concurrent_request,
    api_type="chat",
    language=None,
    stream=False,
    base_url=None,
    show_predictions=False,
):
    """Send every sample in ``data`` to the server and collect the results.

    One warmup request is issued with the first sample, then all samples are
    scheduled as tasks whose concurrency is bounded by a semaphore.

    Args:
        model_name: Model name; also used to load the tokenizer.
        client: Client passed through to ``bound_asr``.
        data: Indexable, sized collection of samples with ``"audio"``
            (``"array"``, ``"sampling_rate"``) and ``"text"`` entries.
        concurrent_request: Maximum number of in-flight requests.
        api_type: ``"chat"`` or ``"transcription"``.
        language: Optional language hint for the transcription API.
        stream: Enable streaming for the transcription API.
        base_url: Server base URL, needed by the streaming path.
        show_predictions: When combined with ``stream``, echo streamed text.

    Returns:
        List of the non-``None`` results returned by ``bound_asr``; empty when
        ``data`` is empty.
    """
    # Nothing to warm up or measure; avoids an IndexError on data[0] below
    # when filtering removed every sample.
    if len(data) == 0:
        print("No samples to process.")
        return []

    sem = asyncio.Semaphore(concurrent_request)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Warmup
    print("Performing warmup...")
    warmup_sample = data[0]
    await bound_asr(
        sem,
        client,
        model_name,
        tokenizer,
        (warmup_sample["audio"]["array"], warmup_sample["audio"]["sampling_rate"]),
        "",
        api_type=api_type,
        language=language,
        stream=stream,
        base_url=base_url,
        show_stream=False,  # Don't show stream during warmup
    )

    print(f"Processing {len(data)} samples...")
    tasks = [
        asyncio.create_task(
            bound_asr(
                sem,
                client,
                model_name,
                tokenizer,
                (sample["audio"]["array"], sample["audio"]["sampling_rate"]),
                sample["text"],
                api_type=api_type,
                language=language,
                stream=stream,
                base_url=base_url,
                show_stream=show_predictions and stream,
            )
        )
        for sample in data
    ]

    results = await asyncio.gather(*tasks)
    return [r for r in results if r is not None]
257
+
258
+
259
def run_evaluation(args):
    """Run the ASR benchmark described by ``args`` and report the metrics.

    Loads the dataset, drops clips of 30 s or longer, optionally truncates to
    ``args.n_examples``, runs all requests, then prints WER, latency and
    throughput figures. Optionally writes a JSON summary and prints sample
    predictions.

    Args:
        args: Parsed command-line namespace with ``api_type``, ``base_url``,
            ``dataset``, ``split``, ``stream``, ``n_examples``, ``model``,
            ``concurrency``, ``language``, ``show_predictions``, ``output``
            and ``print_n`` attributes.
    """
    # Use sync client for transcription API, async for chat API
    if args.api_type == "transcription":
        client = OpenAI(base_url=f"{args.base_url}/v1", api_key="None")
    else:
        client = AsyncOpenAI(base_url=f"{args.base_url}/v1", api_key="None")

    print(f"Loading dataset: {args.dataset}...")
    print(f"Using API type: {args.api_type}" + (" (streaming)" if args.stream else ""))
    dataset = load_dataset(args.dataset, split=args.split)

    # Filter by duration if needed (Whisper max is 30s)
    def add_duration(sample):
        y, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
        sample["duration_ms"] = librosa.get_duration(y=y, sr=sr) * 1000
        return sample

    if "duration_ms" not in dataset.column_names:
        dataset = dataset.map(add_duration)

    dataset = dataset.filter(lambda x: x["duration_ms"] < 30000)

    if args.n_examples > 0:
        dataset = dataset.select(range(min(args.n_examples, len(dataset))))

    # NOTE: the timed window covers everything process_dataset does, which
    # includes loading the tokenizer and the warmup request.
    start = time.perf_counter()
    results = asyncio.run(
        process_dataset(
            args.model,
            client,
            dataset,
            args.concurrency,
            api_type=args.api_type,
            language=args.language,
            stream=args.stream,
            base_url=args.base_url,
            show_predictions=args.show_predictions,
        )
    )
    total_test_time = time.perf_counter() - start

    if not results:
        print("No successful results to evaluate.")
        return

    # Metrics
    latencies = [res[0] for res in results]
    total_tokens = sum(res[1] for res in results)
    predictions = [res[2] for res in results]
    references = [res[3] for res in results]

    wer_metric = load("wer")
    wer_score = 100 * wer_metric.compute(references=references, predictions=predictions)

    print("-" * 30)
    print(f"Results for {args.model}:")
    print(f"Total Requests: {len(results)}")
    print(f"WER: {wer_score:.4f}")
    print(f"Average Latency: {mean(latencies):.4f}s")
    print(f"Median Latency: {median(latencies):.4f}s")
    print(f"95th Latency: {np.percentile(latencies, 95):.4f}s")
    print(f"Throughput: {len(results) / total_test_time:.2f} req/s")
    print(f"Token Throughput: {total_tokens / total_test_time:.2f} tok/s")
    print(f"Total Test Time: {total_test_time:.4f}s")
    print("-" * 30)

    if args.output:
        summary = {
            "model": args.model,
            "dataset": args.dataset,
            "wer": wer_score,
            "avg_latency": mean(latencies),
            "throughput": len(results) / total_test_time,
            "token_throughput": total_tokens / total_test_time,
        }
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(summary, f, indent=2)

    if args.show_predictions:
        print("\n" + "=" * 20 + " Sample Predictions " + "=" * 20)
        num_to_show = min(args.print_n, len(results))
        for i in range(num_to_show):
            print(f"Sample {i+1}:")
            print(f" REF: {references[i]}")
            print(f" PRED: {predictions[i]}")
            print("-" * 40)
        print("=" * 60)
351
+
352
+
353
if __name__ == "__main__":
    # Command-line entry point: parse the options, then run the benchmark.
    parser = argparse.ArgumentParser(description="Benchmark SGLang ASR performance.")
    parser.add_argument(
        "--base-url", default="http://localhost:30000", help="SGLang server base URL"
    )
    parser.add_argument(
        "--model", default="openai/whisper-large-v3", help="Model name on the server"
    )
    parser.add_argument(
        "--dataset",
        default="D4nt3/esb-datasets-earnings22-validation-tiny-filtered",
        help="HF dataset repo",
    )
    parser.add_argument("--split", default="validation", help="Dataset split")
    parser.add_argument(
        "--concurrency", type=int, default=4, help="Number of concurrent requests"
    )
    parser.add_argument(
        "--n-examples",
        "-n",
        type=int,
        default=-1,
        help="Number of examples to test (-1 for all)",
    )
    parser.add_argument("--output", help="Path to save results in JSON")
    parser.add_argument(
        "--show-predictions",
        action="store_true",
        help="Print sample predictions and references",
    )
    parser.add_argument(
        "--print-n", type=int, default=5, help="Number of sample predictions to print"
    )
    parser.add_argument(
        "--api-type",
        choices=["chat", "transcription"],
        default="chat",
        help="API type to use: 'chat' for chat completions with audio_url, 'transcription' for audio.transcriptions API",
    )
    parser.add_argument(
        "--language",
        default=None,
        help="Language for transcription API (e.g., 'English' or 'en')",
    )
    parser.add_argument(
        "--stream",
        action="store_true",
        help="Use streaming mode for transcription API",
    )
    args = parser.parse_args()

    run_evaluation(args)
sglang/benchmark/bench_attention_sink/bench_attention_sink_triton.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ import torch
4
+ import triton
5
+
6
+ from sglang.srt.layers.attention.triton_ops.decode_attention import (
7
+ decode_attention_fwd_grouped,
8
+ )
9
+ from sglang.srt.layers.attention.triton_ops.extend_attention import extend_attention_fwd
10
+
11
+ # gpt oss
12
+ head_num = 64
13
+ head_dim = 64
14
+ head_kv_num = 8
15
+
16
+
17
@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["S"],  # sequence length on x-axis
        x_vals=[128, 256, 512, 1024, 2048, 4096],
        x_log=True,
        line_arg="B",  # batch size as different lines
        line_vals=[1, 8, 32, 128],
        line_names=["B=1", "B=8", "B=32", "B=128"],
        styles=[("blue", "-"), ("green", "-"), ("red", "-"), ("cyan", "-")],
        ylabel="TFLOPS",
        plot_name="attention-sink-triton-decode",
        args={},
    )
)
def benchmark_decode(B, S, H_Q, H_KV, D):
    """Time ``decode_attention_fwd_grouped`` with sinks and report TFLOPS.

    Args:
        B: Batch size (one query token per batch entry).
        S: Number of cached tokens per batch entry.
        H_Q: Number of query heads.
        H_KV: Number of key/value heads.
        D: Head dimension (also used as the value head dimension).

    Returns:
        The throughput figure computed as ``2 * B * S * H_Q * D`` operations
        per call, averaged over 500 timed calls.
    """
    device = torch.device("cuda")
    dtype = torch.bfloat16
    D_V = D
    seq_len = S
    total_tokens = B * seq_len
    sm_scale = 1.0 / (D**0.5)
    max_kv_splits = 8
    num_kv_splits = torch.full((B,), 4, dtype=torch.int32, device="cuda")

    # q represents the new token being generated, one per batch
    q = torch.randn(B, H_Q, D, dtype=dtype, device="cuda")

    # k_buffer and v_buffer represent all previous tokens
    k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
    v_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")

    o = torch.zeros(B, H_Q, D_V, dtype=dtype, device="cuda")

    b_seq_len = torch.full((B,), seq_len, device="cuda")

    # Every batch entry owns a contiguous run of seq_len slots in the buffers.
    kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
    kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len, dim=0)
    kv_indices = torch.arange(total_tokens, device="cuda")

    scratch_shape = (B, H_Q, max_kv_splits, D_V)
    attn_logits1 = torch.empty(scratch_shape, dtype=torch.float32, device="cuda")
    attn_lse1 = torch.empty(scratch_shape, dtype=torch.float32, device="cuda")
    sink = torch.randn(H_Q, device=device, dtype=torch.float32)

    def launch():
        decode_attention_fwd_grouped(
            q,
            k_buffer,
            v_buffer,
            o,
            kv_indptr,
            kv_indices,
            attn_logits1,
            attn_lse1,
            num_kv_splits,
            max_kv_splits,
            sm_scale,
            logit_cap=0.0,
            sinks=sink,
        )

    # warmup
    for _ in range(5):
        launch()

    # benchmark
    run_step = 500
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    for _ in range(run_step):
        launch()
    end_event.record()
    end_event.synchronize()
    torch.cuda.synchronize()
    ms = start_event.elapsed_time(end_event) / run_step

    # NOTE(review): the operation count uses a factor of 2 per
    # (query head, cached token, dim) element - confirm this is the intended
    # FLOP model for decode attention.
    return (2 * B * S * H_Q * D) * 1e-9 / ms
118
+
119
+
120
@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["S"],  # sequence length on x-axis
        x_vals=[128, 256, 512, 1024, 2048, 4096],
        x_log=True,
        line_arg="B",  # batch size as different lines
        line_vals=[1, 8, 32, 128],
        line_names=["B=1", "B=8", "B=32", "B=128"],
        styles=[("blue", "-"), ("green", "-"), ("red", "-"), ("cyan", "-")],
        ylabel="TFLOPS",
        plot_name="attention-sink-triton-extend",
        args={},
    )
)
def benchmark_extend(B, S, H_Q, H_KV, D):
    """Time ``extend_attention_fwd`` with sinks and report TFLOPS.

    ``S`` is split into a cached prefix of ``S // 2`` tokens and an extension
    of ``S // 4`` tokens per batch entry.

    Args:
        B: Batch size.
        S: Context length from which prefix and extend lengths are derived.
        H_Q: Number of query heads.
        H_KV: Number of key/value heads.
        D: Head dimension.

    Returns:
        The throughput figure derived from
        ``2 * tokens * H_Q * (prefix + extend / 2) * D`` operations per call,
        averaged over 500 timed calls.
    """
    # S here represents N_CTX from the test
    device = "cuda"
    dtype = torch.bfloat16

    # Split S into prefix and extend lengths
    prefill_len = S // 2  # Similar to test's N_CTX // 2
    extend_len = S // 4  # Make extend length smaller than prefix

    # Calculate total tokens and extend tokens
    total_extend_tokens = B * extend_len
    total_prefix_tokens = B * prefill_len

    # Create query, key, value tensors for extension
    q_extend = torch.randn(total_extend_tokens, H_Q, D, dtype=dtype, device=device)
    k_extend = torch.randn(total_extend_tokens, H_KV, D, dtype=dtype, device=device)
    v_extend = torch.randn(total_extend_tokens, H_KV, D, dtype=dtype, device=device)
    o_extend = torch.empty_like(q_extend)

    # Create key-value buffers for prefix
    k_buffer = torch.randn(total_prefix_tokens, H_KV, D, dtype=dtype, device=device)
    v_buffer = torch.randn(total_prefix_tokens, H_KV, D, dtype=dtype, device=device)

    # Create index pointers
    qo_offsets = torch.arange(0, (B + 1) * extend_len, extend_len, device=device)
    qo_indptr = qo_offsets.to(torch.int32)
    kv_offsets = torch.arange(0, (B + 1) * prefill_len, prefill_len, device=device)
    kv_indptr = kv_offsets.to(torch.int32)
    kv_indices = torch.arange(0, total_prefix_tokens, device=device).to(torch.int32)

    sm_scale = 1.0 / (D**0.5)
    # sliding_window = 128  # From GPT-OSS config, skip for now
    sliding_window = -1

    sink = torch.randn(H_Q, device=device, dtype=torch.float32)

    def launch():
        extend_attention_fwd(
            q_extend,
            k_extend,
            v_extend,
            o_extend,
            k_buffer,
            v_buffer,
            qo_indptr,
            kv_indptr,
            kv_indices,
            custom_mask=None,
            is_causal=True,
            mask_indptr=None,
            max_len_extend=extend_len,
            sm_scale=sm_scale,
            sliding_window_size=sliding_window,
            sinks=sink,
        )

    # warmup
    for _ in range(5):
        launch()

    # benchmark
    run_step = 500
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    for _ in range(run_step):
        launch()
    end_event.record()
    end_event.synchronize()
    torch.cuda.synchronize()
    ms = start_event.elapsed_time(end_event) / run_step

    # Each extend token is counted against the full prefix plus half of the
    # extension (the causal part), with a factor of 2 per element.
    total_flops = 2 * total_extend_tokens * H_Q * (prefill_len + extend_len / 2) * D
    return total_flops * 1e-12 / (ms * 1e-3)  # convert to TFLOPS
231
+
232
+
233
if __name__ == "__main__":
    # Command-line entry point: choose which benchmark(s) to run.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bench",
        type=str,
        default="all",
        # Reject unknown values up front; previously they silently ran nothing
        # and still printed the "finished" message.
        choices=["all", "extend", "decode"],
        help="all, extend, decode",
    )
    args = parser.parse_args()

    kwargs = {
        "H_Q": head_num,
        "H_KV": head_kv_num,
        "D": head_dim,
    }

    if args.bench in ["all", "decode"]:
        benchmark_decode.run(print_data=True, show_plots=False, **kwargs)

    if args.bench in ["all", "extend"]:
        benchmark_extend.run(print_data=True, show_plots=False, **kwargs)

    print("Benchmark finished!")
sglang/benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Benchmark with lots of common prefixes. Used to benchmark prefix caching performance.
2
+ #
3
+ # Launch a server:
4
+ # python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --log-level-http warning
5
+
6
+ import random
7
+ import string
8
+ import time
9
+
10
+ from tqdm import tqdm
11
+ from transformers import AutoTokenizer
12
+
13
+ import sglang as sgl
14
+ from sglang import set_default_backend
15
+ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
16
+
17
+
18
def generate_random_string(token_length: int) -> str:
    """Return random text that was cut to ``token_length`` tokens.

    A long random alphanumeric string is tokenized with the module-level
    ``tokenizer``, truncated to ``token_length`` token ids (padded with the
    pad token id if it came out shorter), and decoded back to text.
    """
    alphabet = string.ascii_letters + string.digits
    raw_text = "".join(random.choices(alphabet, k=token_length * 100))

    token_ids = tokenizer.encode(raw_text, add_special_tokens=False)[:token_length]

    missing = token_length - len(token_ids)
    if missing > 0:
        token_ids = token_ids + [tokenizer.pad_token_id] * missing

    return tokenizer.decode(token_ids, skip_special_tokens=False)
33
+
34
+
35
def generate_unique_prefix(base_text, index):
    """Overwrite the start of ``base_text`` with the decimal form of ``index``.

    The result is the index digits followed by whatever part of ``base_text``
    lies beyond that many characters.
    """
    index_text = str(index)
    return index_text + base_text[len(index_text) :]
37
+
38
+
39
@sgl.function
def text_qa(s, question, gen_len):
    """Append a ``Q:``/``A:`` prompt to ``s`` and generate ``"answer"``.

    Generation stops at a newline, uses temperature 0 and is capped at
    ``gen_len`` tokens.
    """
    answer = sgl.gen("answer", stop="\n", temperature=0, max_tokens=gen_len)
    s += "Q: " + question + "\n"
    s += "A:" + answer
43
+
44
+
45
def prepare_prompts(num_prefix, num_samples_per_prefix, prefix_length, suffix_length):
    """Build groups of prompts where each group shares one prefix.

    Args:
        num_prefix: Number of prompt groups (distinct prefixes).
        num_samples_per_prefix: Prompts per group.
        prefix_length: Token length requested for the base prefix.
        suffix_length: Token length requested for each random suffix.

    Returns:
        ``(all_prompts, tot_input_len)``: a list of prompt lists (one per
        prefix) and the summed token count of all prompts.
    """
    base_prefix = generate_random_string(prefix_length)

    all_prompts = []
    tot_input_len = 0
    for prefix_index in tqdm(range(num_prefix), desc="prepare prompts"):
        unique_prefix = generate_unique_prefix(base_prefix, prefix_index)
        prompt_list = [
            unique_prefix + generate_random_string(suffix_length)
            for _ in range(num_samples_per_prefix)
        ]
        tot_input_len += sum(len(tokenizer.encode(prompt)) for prompt in prompt_list)
        all_prompts.append(prompt_list)

    return all_prompts, tot_input_len
60
+
61
+
62
def test_batch_by_batch(all_prompts, gen_len):
    """Run each prompt group as its own batch and return the summed time.

    The backend cache is flushed once before the first batch.
    """
    backend.flush_cache()

    tot_time = 0
    for prompt_list in all_prompts:
        tic = time.perf_counter()
        text_qa.run_batch([(prompt, gen_len) for prompt in prompt_list])
        tot_time += time.perf_counter() - tic

    return tot_time
74
+
75
+
76
def test_batch_by_batch_with_hint(all_prompts, gen_len):
    """Like ``test_batch_by_batch``, but send one prompt ahead of each batch.

    The single leading request is intended as a hint to get the group's
    prefix cached before the full batch is sent. Both requests are timed.
    """
    backend.flush_cache()

    tot_time = 0
    for prompt_list in all_prompts:
        tic = time.perf_counter()
        # Send a hint to cache the prefix
        text_qa.run_batch([(prompt, gen_len) for prompt in prompt_list[:1]])
        # Send the batch
        text_qa.run_batch([(prompt, gen_len) for prompt in prompt_list])
        tot_time += time.perf_counter() - tic

    return tot_time
90
+
91
+
92
def test_send_all(all_prompts, gen_len):
    """Send every prompt of every group in one batch and return the time.

    The backend cache is flushed first; flattening the groups is not timed.
    """
    backend.flush_cache()

    flat_prompts = [prompt for prompt_list in all_prompts for prompt in prompt_list]

    tic = time.perf_counter()
    text_qa.run_batch([(prompt, gen_len) for prompt in flat_prompts])
    return time.perf_counter() - tic
104
+
105
+
106
if __name__ == "__main__":
    # tokenizer and backend are module-level names read by the helpers above.
    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
    backend = RuntimeEndpoint("http://127.0.0.1:30000")
    set_default_backend(backend)

    # Fixed seed so the generated prompts are reproducible.
    random.seed(0)
    num_prefix, num_samples_per_prefix = 10, 32
    prefix_length, suffix_length = 1024, 128
    gen_len = 1

    all_prompts, tot_input_len = prepare_prompts(
        num_prefix, num_samples_per_prefix, prefix_length, suffix_length
    )
    print(f"Total input token length: {tot_input_len}\n")

    # Run the three sending strategies in turn on the same prompts.
    cost = test_batch_by_batch(all_prompts, gen_len)
    print(f"Latency of test_batch_by_batch : {cost:.4f} s\n")

    cost = test_batch_by_batch_with_hint(all_prompts, gen_len)
    print(f"Latency of test_batch_by_batch_with_hint: {cost:.4f} s\n")

    cost = test_send_all(all_prompts, gen_len)
    print(f"Latency of test_send_all : {cost:.4f} s\n")
sglang/benchmark/bench_rope/benchmark_rope_index.py ADDED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This script benchmarks MRotaryEmbedding.get_rope_index_glm4v (GLM4V mrope index builder).
2
+ # It generates synthetic multimodal input_ids + attention_mask (+ optional image/video grids),
3
+ # and benchmarks both the multimodal path and the text-only fallback path.
4
+ #
5
+ # == Usage Examples ==
6
+ #
7
+ # python3 benchmark_rope_index.py --device cuda --num-tokens 1024 2048 --benchmark-iter 200
8
+
9
+ import argparse
10
+ import math
11
+ import time
12
+ from dataclasses import dataclass, field
13
+ from typing import Any
14
+
15
+ import numpy as np
16
+ import torch
17
+
18
+ from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
19
+
20
+
21
+ # -----------------------------
22
+ # Minimal config objects
23
+ # -----------------------------
24
@dataclass
class DummyVisionConfig:
    """Stand-in for the vision sub-config read by the benchmark."""

    # Merge factor applied to grid height/width when building image/video grids.
    spatial_merge_size: int = 2
27
+
28
+
29
+ @dataclass
30
+ class DummyHFConfig:
31
+ image_token_id: int = 32000
32
+ video_start_token_id: int = 32001
33
+ video_end_token_id: int = 32002
34
+ vision_config: DummyVisionConfig = field(
35
+ default_factory=lambda: DummyVisionConfig(spatial_merge_size=2)
36
+ )
37
+
38
+
39
+ # -----------------------------
40
+ # Helpers
41
+ # -----------------------------
42
def calculate_stats(times: list[float]) -> dict[str, float]:
    """Summarise a list of timing samples.

    Args:
        times: Per-iteration durations (any unit; the result uses the same unit).

    Returns:
        A dict with keys ``mean``, ``median``, ``p99``, ``min`` and ``max``.
        When ``times`` is empty (for example a run with zero benchmark
        iterations) every value is NaN instead of raising from the NumPy
        reductions.
    """
    if len(times) == 0:
        nan = float("nan")
        return {"mean": nan, "median": nan, "p99": nan, "min": nan, "max": nan}

    times_array = np.asarray(times, dtype=np.float64)
    return {
        "mean": float(np.mean(times_array)),
        "median": float(np.median(times_array)),
        "p99": float(np.percentile(times_array, 99)),
        "min": float(np.min(times_array)),
        "max": float(np.max(times_array)),
    }
52
+
53
+
54
+ def _sync(device: torch.device):
55
+ if device.type == "cuda":
56
+ torch.cuda.synchronize()
57
+
58
+
59
+ def _approx_hw(patches: int, merge: int) -> tuple[int, int]:
60
+ # want (h/merge)*(w/merge) ~= patches
61
+ gh = int(math.sqrt(max(1, patches)))
62
+ gw = max(1, patches // max(1, gh))
63
+ return gh * merge, gw * merge
64
+
65
+
66
def generate_test_data(
    num_tokens: int,
    batch_size: int,
    hf_config: DummyHFConfig,
    dtype: torch.dtype,
    device: torch.device,
    pad_ratio: float,
    num_images_per_sample: int,
    image_patch_tokens: int,
    num_videos_per_sample: int,
    video_patch_tokens: int,
    seed: int,
):
    """
    Generate synthetic (input_ids, attention_mask, image_grid_thw, video_grid_thw).

    Every sample has the same layout: text chunks interleaved with image
    blocks, then video blocks, then a final text chunk, right-padded with
    zeros up to ``num_tokens``. If the non-padded length cannot hold all
    requested image/video blocks plus one text token, no multimodal blocks are
    generated at all.

    NOTE:
      - image_grid_thw / video_grid_thw are global lists across the entire batch in encounter order,
        matching the function's image_index/video_index behavior.
      - image patches are represented by repeated image_token_id.
      - video patches are represented by image_token_id wrapped with start/end tokens.

    Args:
        num_tokens: Padded sequence length of every sample.
        batch_size: Number of samples.
        hf_config: Provides the special token ids and the spatial merge size.
        dtype: Accepted for interface compatibility; not used (ids are int64).
        device: Device on which all tensors are created.
        pad_ratio: Fraction of each sequence that is padding.
        num_images_per_sample: Image blocks per sample.
        image_patch_tokens: Tokens per image block.
        num_videos_per_sample: Video blocks per sample.
        video_patch_tokens: Patch tokens per video block (excluding markers).
        seed: Seed passed to ``torch.manual_seed``.

    Returns:
        ``(input_ids, attention_mask, image_grid_thw, video_grid_thw)``; the
        two grid entries are ``None`` when no block of that kind was generated.
    """
    torch.manual_seed(seed)

    forbidden = {
        0,
        hf_config.image_token_id,
        hf_config.video_start_token_id,
        hf_config.video_end_token_id,
    }
    vocab_size = 50000

    def rand_text(n: int) -> torch.Tensor:
        # generate random ids not in forbidden
        out = torch.randint(1, vocab_size, (n,), device=device, dtype=torch.long)
        # Bump forbidden ids by +1. Ascending order matters: an id bumped onto
        # the next forbidden value is fixed again on a later pass, which plain
        # set iteration order does not guarantee.
        for bad in sorted(forbidden):
            out = torch.where(out == bad, out + 1, out)
        return out

    image_grids: list[list[int]] = []
    video_grids: list[list[int]] = []

    input_ids = torch.zeros((batch_size, num_tokens), device=device, dtype=torch.long)
    attention_mask = torch.zeros(
        (batch_size, num_tokens), device=device, dtype=torch.long
    )

    # Non-padded length, clamped to [1, num_tokens].
    eff_len = int(round(num_tokens * (1.0 - pad_ratio)))
    eff_len = max(1, min(num_tokens, eff_len))

    # Drop all multimodal blocks if they do not fit next to one text token.
    min_needed = 1
    min_needed += num_images_per_sample * image_patch_tokens
    min_needed += num_videos_per_sample * (2 + video_patch_tokens)
    if eff_len < min_needed:
        num_images_per_sample = 0
        num_videos_per_sample = 0

    # The layout is identical for every sample, so compute it once.
    reserved = (
        num_images_per_sample * image_patch_tokens
        + num_videos_per_sample * (2 + video_patch_tokens)
    )
    reserved = min(reserved, max(0, eff_len - 1))
    text_budget = max(1, eff_len - reserved)

    # One text chunk before each block plus a trailing one; spread the
    # remainder over the leading chunks.
    n_text_chunks = num_images_per_sample + num_videos_per_sample + 1
    base, rem = divmod(text_budget, n_text_chunks)
    text_chunks = [base + (1 if i < rem else 0) for i in range(n_text_chunks)]

    for b in range(batch_size):
        blocks: list[torch.Tensor] = []

        tci = 0
        for _ in range(num_images_per_sample):
            blocks.append(rand_text(text_chunks[tci]))
            tci += 1
            blocks.append(
                torch.full(
                    (image_patch_tokens,),
                    hf_config.image_token_id,
                    device=device,
                    dtype=torch.long,
                )
            )

            h, w = _approx_hw(
                image_patch_tokens, hf_config.vision_config.spatial_merge_size
            )
            image_grids.append([1, h, w])

        for _ in range(num_videos_per_sample):
            blocks.append(rand_text(text_chunks[tci]))
            tci += 1
            blocks.append(
                torch.tensor(
                    [hf_config.video_start_token_id], device=device, dtype=torch.long
                )
            )
            blocks.append(
                torch.full(
                    (video_patch_tokens,),
                    hf_config.image_token_id,
                    device=device,
                    dtype=torch.long,
                )
            )
            blocks.append(
                torch.tensor(
                    [hf_config.video_end_token_id], device=device, dtype=torch.long
                )
            )

            h, w = _approx_hw(
                video_patch_tokens, hf_config.vision_config.spatial_merge_size
            )
            # first field = group count used by code; set to 1
            video_grids.append([1, h, w])

        blocks.append(rand_text(text_chunks[tci]))

        tokens = torch.cat(blocks, dim=0)[:eff_len]
        n_valid = tokens.numel()
        # Both tensors start zero-filled, so only the valid prefix is written;
        # the remainder stays as padding (id 0, mask 0).
        input_ids[b, :n_valid] = tokens
        attention_mask[b, :n_valid] = 1

    image_grid_thw = (
        torch.tensor(image_grids, device=device, dtype=torch.long)
        if len(image_grids)
        else None
    )
    video_grid_thw = (
        torch.tensor(video_grids, device=device, dtype=torch.long)
        if len(video_grids)
        else None
    )
    return (
        input_ids,
        attention_mask,
        image_grid_thw,
        video_grid_thw,
    )
222
+
223
+
224
def benchmark_rope_index(
    model_name: str,
    tp_size: int,
    num_tokens: int,
    batch_size: int,
    pad_ratio: float,
    spatial_merge_size: int,
    num_images: int,
    image_patch_tokens: int,
    num_videos: int,
    video_patch_tokens: int,
    dtype: torch.dtype,
    seed: int,
    warmup_iter: int,
    benchmark_iter: int,
    device: torch.device,
):
    """Benchmark ``MRotaryEmbedding.get_rope_index_glm4v`` for one configuration.

    Synthetic data is generated once, a smoke test checks the output shapes
    when multimodal grids exist, and then the call is timed twice per
    iteration count: once with the generated grids ("multimodal") and once
    with both grids set to ``None`` ("fallback").

    Args:
        model_name: Label printed in the header only.
        tp_size: Label printed in the header only.
        num_tokens: Padded sequence length.
        batch_size: Number of sequences.
        pad_ratio: Fraction of each sequence that is padding.
        spatial_merge_size: Merge factor stored in the dummy vision config.
        num_images: Image blocks per sample.
        image_patch_tokens: Tokens per image block.
        num_videos: Video blocks per sample.
        video_patch_tokens: Patch tokens per video block.
        dtype: Forwarded to ``generate_test_data`` and printed.
        seed: RNG seed.
        warmup_iter: Untimed warm-up iterations.
        benchmark_iter: Timed iterations per branch.
        device: Device on which tensors are created.

    Returns:
        ``(multimodal_stats, fallback_stats, speedup)`` where ``speedup`` is
        multimodal mean / fallback mean, or NaN when no multimodal data could
        be generated.
    """
    torch.manual_seed(seed)
    hf_config = DummyHFConfig(
        image_token_id=32000,
        video_start_token_id=32001,
        video_end_token_id=32002,
        vision_config=DummyVisionConfig(spatial_merge_size=spatial_merge_size),
    )

    print(80 * "=")
    print(
        f"Evaluating: {model_name} tp_size={tp_size} "
        f"num_tokens={num_tokens} batch={batch_size} pad_ratio={pad_ratio} "
        f"images/sample={num_images} image_patch_tokens={image_patch_tokens} "
        f"videos/sample={num_videos} video_patch_tokens={video_patch_tokens} "
        f"dtype={dtype} device={device}"
    )

    input_ids, attention_mask, image_grid_thw, video_grid_thw = generate_test_data(
        num_tokens=num_tokens,
        batch_size=batch_size,
        hf_config=hf_config,
        dtype=dtype,
        device=device,
        pad_ratio=pad_ratio,
        num_images_per_sample=num_images,
        image_patch_tokens=image_patch_tokens,
        num_videos_per_sample=num_videos,
        video_patch_tokens=video_patch_tokens,
        seed=seed,
    )

    get_index = MRotaryEmbedding.get_rope_index_glm4v
    # Keyword sets for the two branches; they differ only in the grids.
    multimodal_kwargs = dict(
        input_ids=input_ids,
        hf_config=hf_config,
        image_grid_thw=image_grid_thw,
        video_grid_thw=video_grid_thw,
        attention_mask=attention_mask,
    )
    fallback_kwargs = dict(
        multimodal_kwargs, image_grid_thw=None, video_grid_thw=None
    )

    def timed_runs(kwargs):
        # perf_counter is monotonic and high-resolution, unlike time.time().
        samples = []
        for _ in range(benchmark_iter):
            _sync(device)
            start = time.perf_counter()
            get_index(**kwargs)
            _sync(device)
            samples.append(time.perf_counter() - start)
        return samples

    # Smoke test
    has_mm = (image_grid_thw is not None) or (video_grid_thw is not None)
    if has_mm:
        pos, delta = get_index(**multimodal_kwargs)
        assert pos.shape == (3, batch_size, num_tokens)
        assert delta.shape == (batch_size, 1)

    # Warm up
    for _ in range(warmup_iter):
        if has_mm:
            get_index(**multimodal_kwargs)
        get_index(**fallback_kwargs)

    _sync(device)

    # Time multimodal branch (identical to the fallback call when has_mm is
    # False; kept so the returned stats always have the same shape).
    multimodal_times = timed_runs(multimodal_kwargs)

    # Time fallback branch
    fallback_times = timed_runs(fallback_kwargs)

    multimodal_stats = calculate_stats(multimodal_times)
    fallback_stats = calculate_stats(fallback_times)

    print(f"\nPerformance for config (B={batch_size}, T={num_tokens}):")
    print(
        f"Multimodal: mean={multimodal_stats['mean']:.8f}s, "
        f"median={multimodal_stats['median']:.8f}s, "
        f"p99={multimodal_stats['p99']:.8f}s"
    )
    print(
        f"Fallback: mean={fallback_stats['mean']:.8f}s, "
        f"median={fallback_stats['median']:.8f}s, "
        f"p99={fallback_stats['p99']:.8f}s"
    )

    if has_mm:
        speedup = (
            multimodal_stats["mean"] / fallback_stats["mean"]
            if fallback_stats["mean"] > 0
            else float("inf")
        )
    else:
        speedup = float("nan")
        print(
            "[INFO] num_tokens too small for multimodal segments; skip multimodal benchmark."
        )

    # Printed once for both cases (previously emitted twice when has_mm).
    print(f"Fallback Speedup over Multimodal: {speedup:.8f}x")

    return multimodal_stats, fallback_stats, speedup
366
+
367
+
368
if __name__ == "__main__":
    # Command-line entry point: parse options, then benchmark every requested
    # sequence length with otherwise identical settings.
    parser = argparse.ArgumentParser(
        description="Benchmark GLM4V get_rope_index_glm4v."
    )

    # general options
    parser.add_argument("--model-name", type=str, default="GLM4V")
    parser.add_argument("--tp-size", type=int, default=1)
    default_device = "cuda" if torch.cuda.is_available() else "cpu"
    parser.add_argument("--device", type=str, default=default_device)
    parser.add_argument("--warmup-iter", type=int, default=10)
    parser.add_argument("--benchmark-iter", type=int, default=100)
    parser.add_argument("--dtype", type=str, choices=["int64"], default="int64")
    parser.add_argument("--seed", type=int, default=0)

    # token length sweep
    parser.add_argument("--num-tokens", type=int, nargs="+", required=False)

    # data shape knobs
    parser.add_argument("--batch-size", type=int, default=1)
    parser.add_argument("--pad-ratio", type=float, default=0.0)
    parser.add_argument("--spatial-merge-size", type=int, default=2)
    parser.add_argument("--num-images", type=int, default=1)
    parser.add_argument("--image-patch-tokens", type=int, default=256)
    parser.add_argument("--num-videos", type=int, default=1)
    parser.add_argument("--video-patch-tokens", type=int, default=256)

    # output
    parser.add_argument("--out-dir", type=str, default=".")
    args = parser.parse_args()
    print(args)

    device = torch.device(args.device)

    # Without --num-tokens, sweep the powers of two from 2**0 to 2**17.
    num_tokens_list = (
        args.num_tokens
        if args.num_tokens is not None
        else [2**i for i in range(0, 18)]
    )

    rows: list[dict[str, Any]] = []

    for num_tokens in num_tokens_list:
        multimodal_stats, fallback_stats, speedup = benchmark_rope_index(
            model_name=args.model_name,
            tp_size=args.tp_size,
            num_tokens=num_tokens,
            batch_size=args.batch_size,
            pad_ratio=args.pad_ratio,
            spatial_merge_size=args.spatial_merge_size,
            num_images=args.num_images,
            image_patch_tokens=args.image_patch_tokens,
            num_videos=args.num_videos,
            video_patch_tokens=args.video_patch_tokens,
            dtype=getattr(torch, args.dtype),
            seed=args.seed,
            warmup_iter=args.warmup_iter,
            benchmark_iter=args.benchmark_iter,
            device=device,
        )
sglang/benchmark/benchmark_batch/benchmark_batch.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import concurrent.futures
2
+ import os
3
+ import random
4
+ import time
5
+ from concurrent.futures import ProcessPoolExecutor
6
+ from statistics import mean
7
+
8
+ import requests
9
+ from tqdm import tqdm
10
+ from transformers import AutoTokenizer
11
+
12
+ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
13
+
14
+ ###############################################################################
15
+ # CONFIG
16
+ ###############################################################################
17
+ ENDPOINT_URL = "http://127.0.0.1:30000"
18
+ TOKENIZER_DIR = "/models/meta-llama/Llama-3.2-3B"
19
+
20
+ # Benchmark configurations
21
+ NUM_REQUESTS = 10 # Total number of requests (each with BATCH_SIZE prompts)
22
+ NUM_TOKENS = 32000 # Tokens per prompt
23
+ BATCH_SIZE = 8 # Number of prompts per request
24
+ GEN_TOKENS = 0 # Tokens to generate per prompt
25
+
26
+
27
+ ###############################################################################
28
+ # REQUEST GENERATION (in parallel)
29
+ ###############################################################################
30
def generate_random_prompt(index, tokenizer_dir, num_tokens):
    """Build one prompt by decoding ``num_tokens`` uniformly random token ids.

    The tokenizer is loaded from ``tokenizer_dir`` on every call, and the
    decoded text is prefixed with ``"Prompt {index}: "``.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
    highest_id = tokenizer.vocab_size - 1

    sampled_ids = [random.randint(0, highest_id) for _ in range(num_tokens)]
    decoded = tokenizer.decode(sampled_ids, clean_up_tokenization_spaces=True)
    return f"Prompt {index}: {decoded}"
41
+
42
+
43
def prepare_all_prompts(num_requests, batch_size, num_tokens, tokenizer_dir):
    """Generate prompts for all requests in parallel.

    ``num_requests * batch_size`` prompts are produced by worker processes and
    stored at the position matching their prompt index, then sliced into
    ``num_requests`` consecutive batches of ``batch_size`` prompts.

    Args:
        num_requests: Number of batches to build.
        batch_size: Prompts per batch.
        num_tokens: Random tokens per prompt.
        tokenizer_dir: Tokenizer name or path handed to each worker.

    Returns:
        A list of ``num_requests`` lists, each holding ``batch_size`` prompts.
    """
    total_prompts = num_requests * batch_size
    all_prompts = [None] * total_prompts
    # ProcessPoolExecutor rejects max_workers <= 0, so keep at least one
    # worker even when there is nothing to generate.
    max_workers = max(1, min(os.cpu_count() or 1, total_prompts))

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Map each future to its prompt index; this replaces a linear
        # futures.index() lookup per completed future.
        index_of = {
            executor.submit(generate_random_prompt, i, tokenizer_dir, num_tokens): i
            for i in range(total_prompts)
        }
        for future in tqdm(
            concurrent.futures.as_completed(index_of),
            total=total_prompts,
            desc="Generating prompts",
        ):
            all_prompts[index_of[future]] = future.result()

    batched_prompts = [
        all_prompts[i * batch_size : (i + 1) * batch_size] for i in range(num_requests)
    ]

    print(
        f"Generated {total_prompts} prompts with {num_tokens} tokens each, grouped into {num_requests} requests of {batch_size} prompts.\n"
    )
    return batched_prompts
70
+
71
+
72
+ ###############################################################################
73
+ # HTTP CALLS
74
+ ###############################################################################
75
def send_batch_request(endpoint, prompts, gen_tokens, request_id):
    """Send a batch of prompts to the /generate endpoint synchronously.

    Args:
        endpoint: Object exposing ``base_url``.
        prompts: List of prompt strings sent as one request.
        gen_tokens: Value used for ``max_new_tokens``.
        request_id: Identifier echoed back in the result and in error output.

    Returns:
        ``(request_id, elapsed_ms, avg_ms_per_prompt, success, num_prompts)``.
        Any failure is printed and reported as ``(request_id, 0, 0, False,
        num_prompts)`` so the caller can continue with the next request.
    """
    sampling_params = {
        "max_new_tokens": gen_tokens,
        "temperature": 0.7,
        "stop": "\n",
    }
    data = {"text": prompts, "sampling_params": sampling_params}

    start_time = time.perf_counter()
    try:
        response = requests.post(
            endpoint.base_url + "/generate", json=data, timeout=3600
        )
        if response.status_code != 200:
            # Error bodies are not always JSON; fall back to the raw text so
            # the status code and message are not masked by a decode error.
            try:
                error = response.json()
            except ValueError:
                error = response.text
            raise RuntimeError(
                f"Request {request_id} failed (HTTP {response.status_code}): {error}"
            )
        # Decode the body so a malformed success response still counts as a
        # failure, as before; the decoded content itself is not used.
        response.json()
        elapsed_time = (time.perf_counter() - start_time) * 1000  # Convert to ms
        avg_per_prompt = elapsed_time / len(prompts) if prompts else 0
        return request_id, elapsed_time, avg_per_prompt, True, len(prompts)
    except Exception as e:
        # Deliberately broad: one failed request must not abort the benchmark.
        print(f"[Request] Error for request {request_id}: {e}")
        return request_id, 0, 0, False, len(prompts)
99
+
100
+
101
def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
    """Send every batch one after another and collect the per-request results.

    Returns:
        ``(results, total_latency_ms)`` where ``results`` holds one
        ``send_batch_request`` tuple per batch and ``total_latency_ms`` is the
        wall-clock time of the whole loop.
    """
    num_requests = len(batched_prompts)
    results = []

    # Wall-clock start for the total latency across all requests.
    started = time.perf_counter()

    for request_id, batch_prompts in enumerate(batched_prompts, start=1):
        assert (
            len(batch_prompts) == batch_size
        ), f"Request {request_id} should have {batch_size} prompts, got {len(batch_prompts)}"

        print(
            f"[Request] Sending request {request_id}/{num_requests} with {len(batch_prompts)} prompts at {int(time.time()*1000)}"
        )
        results.append(
            send_batch_request(endpoint, batch_prompts, gen_tokens, request_id)
        )

    # Seconds -> milliseconds.
    total_latency = (time.perf_counter() - started) * 1000

    return results, total_latency
125
+
126
+
127
+ ###############################################################################
128
+ # RESULTS
129
+ ###############################################################################
130
def process_results(results, total_latency, num_requests):
    """Process and display benchmark results.

    Args:
        results: Iterable of ``(request_id, elapsed_ms, avg_ms_per_prompt,
            success, batch_size)`` tuples.
        total_latency: Wall-clock time of the whole run in milliseconds.
        num_requests: Accepted for interface compatibility; the summary counts
            ``len(results)`` instead.

    Returns:
        A dict with the computed metrics (previously they were only printed):
        ``total_requests``, ``total_prompts``, ``successful_requests``,
        ``failed_requests``, ``total_latency_ms``, ``avg_request_latency_ms``,
        ``avg_per_prompt_latency_ms`` and ``throughput`` (prompts/second over
        the summed latency of successful requests).
    """
    total_time = 0
    successful_requests = 0
    failed_requests = 0
    request_latencies = []
    per_prompt_latencies = []
    total_prompts = 0

    for _request_id, elapsed_time, avg_per_prompt, success, batch_size in results:
        if not success:
            failed_requests += 1
            continue
        successful_requests += 1
        total_prompts += batch_size
        request_latencies.append(elapsed_time)
        per_prompt_latencies.append(avg_per_prompt)
        total_time += elapsed_time / 1000  # Convert to seconds

    # Averages and throughput default to 0 when nothing succeeded.
    avg_request_latency = mean(request_latencies) if request_latencies else 0
    avg_per_prompt_latency = mean(per_prompt_latencies) if per_prompt_latencies else 0
    throughput = total_prompts / total_time if total_time > 0 else 0

    print("\nBenchmark Summary:")
    print(f" Total requests sent: {len(results)}")
    print(f" Total prompts sent: {total_prompts}")
    print(f" Successful requests: {successful_requests}")
    print(f" Failed requests: {failed_requests}")
    print(f" Total latency (all requests): {total_latency:.2f} ms")
    print(f" Avg per request latency: {avg_request_latency:.2f} ms")
    print(f" Avg per prompt latency: {avg_per_prompt_latency:.2f} ms")
    print(f" Throughput: {throughput:.2f} prompts/second\n")

    return {
        "total_requests": len(results),
        "total_prompts": total_prompts,
        "successful_requests": successful_requests,
        "failed_requests": failed_requests,
        "total_latency_ms": total_latency,
        "avg_request_latency_ms": avg_request_latency,
        "avg_per_prompt_latency_ms": avg_per_prompt_latency,
        "throughput": throughput,
    }
162
+
163
+
164
+ ###############################################################################
165
+ # MAIN
166
+ ###############################################################################
167
def main():
    """Generate the prompts, run the sequential benchmark and print a summary."""
    endpoint = RuntimeEndpoint(ENDPOINT_URL)

    batched_prompts = prepare_all_prompts(
        NUM_REQUESTS, BATCH_SIZE, NUM_TOKENS, TOKENIZER_DIR
    )

    # Flush cache before benchmark
    # endpoint.flush_cache()

    print(
        f"Starting benchmark: NUM_TOKENS={NUM_TOKENS}, BATCH_SIZE={BATCH_SIZE}, NUM_REQUESTS={NUM_REQUESTS}\n"
    )
    results, total_latency = run_benchmark(
        endpoint, batched_prompts, BATCH_SIZE, GEN_TOKENS
    )

    process_results(results, total_latency, NUM_REQUESTS)


if __name__ == "__main__":
    # Seed the RNG before any prompt generation is started.
    random.seed(0)
    main()
sglang/benchmark/benchmark_batch/benchmark_tokenizer.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import random
3
+ import time
4
+ from statistics import mean
5
+
6
+ from transformers import AutoTokenizer
7
+
8
+ from sglang.srt.utils.patch_tokenizer import patch_tokenizer
9
+
10
+
11
def main():
    """Run the encode and/or decode tokenizer benchmarks selected on the CLI.

    Random token ids are generated once for the largest requested batch size;
    the encode benchmark works on their decoded text, the decode benchmark on
    the ids themselves.
    """
    args = parse_args()

    rule = "-" * 60
    print("Tokenizer Benchmark: Sequential vs Batch Processing")
    print(rule)
    print(f"Tokenizer: {args.tokenizer}")
    print(f"Functions: {', '.join(args.function)}")
    print(f"Tokens per prompt: {args.num_tokens}")
    print(f"Number of runs per batch size: {args.num_runs}")
    print(f"Batch mode: {', '.join(args.batch_mode)}")
    print(rule)

    tokenizer = patch_tokenizer(
        AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=True)
    )

    token_ids = generate_random_token_ids(
        num_prompts=max(args.batch_sizes),
        num_tokens=args.num_tokens,
        tokenizer=tokenizer,
    )

    # Settings shared by both benchmark invocations.
    common = {
        "batch_sizes": args.batch_sizes,
        "num_runs": args.num_runs,
        "batch_mode": args.batch_mode,
    }

    if "encode" in args.function:
        prompts = [
            tokenizer.decode(ids, clean_up_tokenization_spaces=True)
            for ids in token_ids
        ]

        def encode_one_by_one(batch):
            return [tokenizer.encode(p) for p in batch]

        def encode_batched(batch):
            return tokenizer(batch)

        run_benchmark(
            name="encode",
            data=prompts,
            sequential_fn=encode_one_by_one,
            batch_fn=encode_batched,
            **common,
        )

    if "decode" in args.function:
        # mimic DetokenizerManager's usual case
        decode_kwargs = {
            "skip_special_tokens": True,
            "spaces_between_special_tokens": True,
        }

        def decode_one_by_one(batch):
            return [tokenizer.decode(ids, **decode_kwargs) for ids in batch]

        def decode_batched(batch):
            return tokenizer.batch_decode(batch, **decode_kwargs)

        run_benchmark(
            name="decode",
            data=token_ids,
            sequential_fn=decode_one_by_one,
            batch_fn=decode_batched,
            **common,
        )
63
+
64
+
65
def run_benchmark(
    *, name, data, sequential_fn, batch_fn, batch_sizes, num_runs, batch_mode
):
    """Benchmark one tokenizer function for every batch size and print the results."""
    banner = "=" * 60
    print("\n" + banner)
    print(f"{name.upper()} BENCHMARK")
    print(banner)

    results = []
    for size in batch_sizes:
        results.append(
            benchmark(
                data=data,
                batch_size=size,
                sequential_fn=sequential_fn,
                batch_fn=batch_fn,
                num_runs=num_runs,
                batch_mode=batch_mode,
            )
        )
    print_results(results=results, func_name=name, batch_mode=batch_mode)
84
+
85
+
86
def benchmark(*, data, batch_size, sequential_fn, batch_fn, num_runs, batch_mode):
    """Time the sequential and/or batched variant on the first ``batch_size`` items.

    Returns:
        A dict that always contains ``batch_size``; per enabled mode it adds
        the average (``avg_*_ms``) and the raw run times (``*_runs``), and
        when both modes ran also ``speedup_factor`` (sequential / batch, or 0
        if the batch average is not positive).
    """
    subset = data[:batch_size]
    want_single = "single" in batch_mode
    want_batch = "batch" in batch_mode

    report = {"batch_size": batch_size}

    if want_single:
        single_ms = measure_times(fn=lambda: sequential_fn(subset), num_runs=num_runs)
        report["avg_sequential_ms"] = mean(single_ms)
        report["sequential_runs"] = single_ms

    if want_batch:
        batched_ms = measure_times(fn=lambda: batch_fn(subset), num_runs=num_runs)
        report["avg_batch_ms"] = mean(batched_ms)
        report["batch_runs"] = batched_ms

    if want_single and want_batch:
        batch_avg = report["avg_batch_ms"]
        if batch_avg > 0:
            report["speedup_factor"] = report["avg_sequential_ms"] / batch_avg
        else:
            report["speedup_factor"] = 0

    return report
117
+
118
+
119
def print_results(*, results, func_name, batch_mode):
    """Print the per-batch-size run details followed by a summary table."""
    show_single = "single" in batch_mode
    show_batch = "batch" in batch_mode
    show_speedup = show_single and show_batch

    # Detailed section: every run of every enabled mode.
    for entry in results:
        print(f"\nBatch size: {entry['batch_size']}")
        if show_single:
            print_runs(
                label=f"Sequential {func_name}",
                runs=entry["sequential_runs"],
                avg=entry["avg_sequential_ms"],
            )
        if show_batch:
            print_runs(
                label=f"Batch {func_name}",
                runs=entry["batch_runs"],
                avg=entry["avg_batch_ms"],
            )
        if show_speedup:
            print(f" Speedup factor: {entry['speedup_factor']:.2f}x")

    banner = "=" * 60
    print("\n" + banner)
    print(f"SUMMARY: {func_name.upper()}")
    print(banner)

    # Summary table: one 18-character, left-aligned column per enabled metric.
    headers = ["Batch Size"]
    if show_single:
        headers.append("Sequential (ms)")
    if show_batch:
        headers.append("Batch (ms)")
    if show_speedup:
        headers.append("Speedup")
    print("".join(f"{h:<18}" for h in headers))
    print("-" * (18 * len(headers)))

    for entry in results:
        cells = [f"{entry['batch_size']}"]
        if show_single:
            cells.append(f"{entry['avg_sequential_ms']:.2f} ms")
        if show_batch:
            cells.append(f"{entry['avg_batch_ms']:.2f} ms")
        if show_speedup:
            cells.append(f"{entry['speedup_factor']:.2f}x")
        print("".join(f"{v:<18}" for v in cells))
161
+
162
+
163
def print_runs(*, label, runs, avg):
    """Print a label, each run time (numbered from 1) and the average, in ms."""
    print(f" {label}:")
    for run_no, elapsed_ms in enumerate(runs, start=1):
        print(f" Run {run_no}: {elapsed_ms:.2f} ms")
    print(f" Average: {avg:.2f} ms")
168
+
169
+
170
def measure_times(*, fn, num_runs):
    """Call ``fn`` ``num_runs`` times and return each duration in milliseconds."""

    def timed_call():
        began = time.perf_counter()
        fn()
        return (time.perf_counter() - began) * 1000

    return [timed_call() for _ in range(num_runs)]
177
+
178
+
179
def generate_random_token_ids(*, num_prompts, num_tokens, tokenizer):
    """Return ``num_prompts`` lists of ``num_tokens`` random ids in [0, vocab_size)."""
    vocab_size = tokenizer.vocab_size
    print(f"Generating {num_prompts} random sequences with {num_tokens} tokens each...")

    sequences = []
    for _ in range(num_prompts):
        sequence = []
        for _ in range(num_tokens):
            sequence.append(random.randint(0, vocab_size - 1))
        sequences.append(sequence)
    return sequences
186
+
187
+
188
def parse_args():
    """Parse the command-line options of the tokenizer benchmark."""
    parser = argparse.ArgumentParser(
        description="Tokenizer Benchmark: Sequential vs Batch Processing"
    )

    # (flag, add_argument keyword arguments), registered in this order.
    option_specs = (
        (
            "--tokenizer",
            {
                "type": str,
                "required": True,
                "help": "Tokenizer name or path (e.g. nvidia/Kimi-K2-Thinking-NVFP4)",
            },
        ),
        (
            "--function",
            {
                "type": str,
                "nargs": "+",
                "choices": ["encode", "decode"],
                "default": ["encode", "decode"],
                "help": "Functions to benchmark (default: encode decode)",
            },
        ),
        (
            "--num-tokens",
            {
                "type": int,
                "default": 20000,
                "help": "Number of tokens per prompt (default: 20000)",
            },
        ),
        (
            "--batch-sizes",
            {
                "type": int,
                "nargs": "+",
                "default": [1, 2, 4, 8],
                "help": "Batch sizes to test (default: 1 2 4 8)",
            },
        ),
        (
            "--batch-mode",
            {
                "nargs": "+",
                "choices": ["single", "batch"],
                "default": ["single", "batch"],
                "help": "Benchmark modes to run (default: single batch)",
            },
        ),
        (
            "--num-runs",
            {
                "type": int,
                "default": 5,
                "help": "Number of runs per batch size (default: 5)",
            },
        ),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    return parser.parse_args()


if __name__ == "__main__":
    # Seed the RNG so the generated token ids are the same on every run.
    random.seed(0)
    main()
sglang/benchmark/benchmark_vllm_060/README.md ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## How to reproduce the benchmark results for SGLang v0.3.0 compared to vLLM v0.6.0
2
+
3
+ In short, with multi-step scheduling enabled, in the online scenarios we benchmarked, the Median TTFT of vLLM is **3 times** that of SGLang, and its Median ITL is **10 times** that of SGLang (lower is better for both). vLLM's multi-step optimization did not improve throughput while keeping Median TTFT and ITL low. Also, in the maximum-throughput benchmark, if vLLM uses its default configuration instead of explicitly setting GPU memory utilization to 0.95, its maximum throughput is **lower** than that of SGLang.
4
+
5
+ ## Online benchmark results
6
+
7
+ ### Llama 3.1 8B Instruct 1 x A100 80G
8
+
9
+ | RPS | Num Prompts | Engine | Median E2E Latency | Median TTFT | Median TPOT | Median ITL |
10
+ |------|-------------|--------|--------------------|-------------|-------------|------------|
11
+ | 4 | 1200 | SGLang | 1564.17 | **31.98** | 13.17 | **11.93** |
12
+ | 4 | 1200 | vLLM | 1691.97 | **100.48** | 14.14 | **129.32** |
13
+ | 8 | 2400 | SGLang | 2175.02 | **35.68** | 17.85 | **14.41** |
14
+ | 8 | 2400 | vLLM | 2137.16 | **120.39** | 17.09 | **158.63** |
15
+
16
+ ### Llama 3.1 70B Instruct 4 x H100 80G
17
+
18
+ | RPS | Num Prompts | Engine | Median E2E Latency | Median TTFT | Median TPOT | Median ITL |
19
+ |------|-------------|--------|--------------------|-------------|-------------|------------|
20
+ | 4 | 1200 | SGLang | 3005.24 | **53.94** | 25.03 | **21.67** |
21
+ | 4 | 1200 | vLLM | 2915.60 | **179.15** | 23.58 | **231.23** |
22
+ | 8 | 2400 | SGLang | 4064.98 | **58.11** | 33.07 | **24.45** |
23
+ | 8 | 2400 | vLLM | 3752.38 | **207.12** | 29.15 | **275.32** |
24
+
25
+ ## Offline benchmark results
26
+
27
+ ### Llama 3.1 8B Instruct 1 x A100 80G
28
+
29
+ | RPS | Num Prompts | Engine | Request throughput | Output token throughput |
30
+ |------|-------------|--------|--------------------|-------------------------|
31
+ | inf | 5000 | SGLang | 22.03 | **4281.51** |
32
+ | inf | 5000 | vLLM | 21.27 | **4132.37** |
33
+
34
+ ### Llama 3.1 70B Instruct 4 x H100 80G
35
+
36
+ | RPS | Num Prompts | Engine | Request throughput | Output token throughput |
37
+ |------|-------------|--------|--------------------|-------------------------|
38
+ | inf | 5000 | SGLang | 19.84 | **3856.01** |
39
+ | inf | 5000 | vLLM | 19.04 | **3700.64** |
40
+
41
+ ## Installation
42
+
43
+ ```bash
44
+ # install sglang v0.3.0
45
+ pip install --upgrade pip
46
+ pip install "sglang[all]"==0.3.0
47
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
48
+
49
+ # install vllm v0.6.0
50
+ pip install vllm==0.6.0
51
+ ```
52
+
53
+ ## Notes
54
+
55
+ We referred to the reproduction method in https://github.com/vllm-project/vllm/issues/8176, and added the `--num-scheduler-steps 10` parameter when starting the vLLM server. The `gpu_memory_utilization` of vLLM is by default 0.9 at both TP 1 and TP 4, while SGLang's `mem_frac` is 0.88 at TP 1 and 0.85 at TP 4, so we manually set it to 0.88 at TP 4.
56
+
57
+ ## Online benchmarks
58
+
59
+ ```bash
60
+ # Llama 3.1 8B Instruct on 1 x A100
61
+ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
62
+ python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096
63
+
64
+ # Llama 3.1 70B Instruct on 4 x H100
65
+ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 4
66
+ python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096
67
+
68
+ # bench serving
69
+ python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 1200 --request-rate 4
70
+ python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 2400 --request-rate 8
71
+ python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-prompts 1200 --request-rate 4
72
+ python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-prompts 2400 --request-rate 8
73
+ ```
74
+
75
+ ## Offline benchmarks
76
+
77
+ ```bash
78
+ # Llama 3.1 8B Instruct on 1 x A100
79
+ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
80
+ python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096
81
+
82
+ # Llama 3.1 70B Instruct on 4 x H100
83
+ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 4 --mem-frac 0.88
84
+ python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096
85
+
86
+ # bench serving
87
+ python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 5000
88
+ python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-prompts 5000
89
+ ```
sglang/benchmark/blog_v0_2/405b_sglang.sh ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Create dummy weights:
2
+ # 1. Create a folder `~/llama-3.1-405b-fp8-dummy` and create `config.json` and tokenizer under this folder.
3
+ # 2. Get `config.json` from ./config.md
4
+ # 3. Download the tokenizer
5
+ # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json
6
+ # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json
7
+
8
+ # Launch sglang
9
+ # python -m sglang.launch_server --model-path ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quantization fp8 --disable-radix --mem-frac 0.87
10
+
11
+ # offline
12
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > sglang_log11
13
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 > sglang_log12
14
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 > sglang_log13
15
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 > sglang_log14
16
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 > sglang_log15
17
+ python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 2000 > sglang_log21
18
+
19
+ # online
20
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 > sglang_log31
21
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > sglang_log32
22
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > sglang_log33
23
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > sglang_log34
24
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > sglang_log35
sglang/benchmark/blog_v0_2/405b_trt.sh ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Launch trtllm
2
+ # https://github.com/sgl-project/tensorrt-demo
3
+
4
+ # offline
5
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log11
6
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log12
7
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log13
8
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log14
9
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log15
10
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 2000 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log21
11
+
12
+ # online
13
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log31
14
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log32
15
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log33
16
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log34
17
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log35
sglang/benchmark/blog_v0_2/405b_vllm.sh ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Create dummy weights:
2
+ # 1. Create a folder `~/llama-3.1-405b-fp8-dummy` and create `config.json` and tokenizer under this folder.
3
+ # 2. Get `config.json` from ./config.md
4
+ # 3. Download the tokenizer
5
+ # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json
6
+ # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json
7
+
8
+ # Launch vllm
9
+ # python3 -m vllm.entrypoints.openai.api_server --model ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --disable-log-requests --tensor-parallel-size 8 --max-model-len 10000
10
+
11
+ # offline
12
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > vllm_log11
13
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 > vllm_log12
14
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 > vllm_log13
15
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 > vllm_log14
16
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 > vllm_log15
17
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 2000 > vllm_log21
18
+
19
+ # online
20
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 > vllm_log31
21
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > vllm_log32
22
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > vllm_log33
23
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > vllm_log34
24
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > vllm_log35
sglang/benchmark/blog_v0_2/README.md ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # How to reproduce the benchmark results of SGLang
2
+
3
+ ## Prerequisite
4
+
5
+ ### Install the latest SGLang
6
+
7
+ ```bash
8
+ git clone https://github.com/sgl-project/sglang.git
9
+ cd sglang
10
+ git checkout v0.2.7
11
+
12
+ pip install --upgrade pip
13
+ pip install -e "python[all]"
14
+
15
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
16
+ ```
17
+
18
+ ### Set up ulimit and HF_TOKEN
19
+
20
+ ```bash
21
+ ulimit -n 65535
22
+ # Change the token to a real and usable one, with access permissions for the Llama 3 models.
23
+ export HF_TOKEN=hf_token
24
+ ```
25
+
26
+ ### Launch the server
27
+
28
+ ```bash
29
+ # Meta-Llama-3.1-8B-Instruct
30
+ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
31
+
32
+ # Meta-Llama-3.1-70B-Instruct
33
+ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 8
34
+
35
+ # Meta-Llama-3-70B-Instruct-FP8
36
+ python -m sglang.launch_server --model-path neuralmagic/Meta-Llama-3-70B-Instruct-FP8 --disable-radix-cache --tp 8
37
+ ```
38
+
39
+ ## Benchmark
40
+
41
+ ### Hardware Requirements
42
+
43
+ - 8B models: Single NVIDIA A100 80GB GPU
44
+ - 70B models: 8 x NVIDIA A100 80GB GPUs with Tensor Parallelism (TP) 8
45
+ - 70B FP8 models: 8 x NVIDIA H100 GPUs with Tensor Parallelism (TP) 8
46
+
47
+ Please ensure you have the appropriate hardware before running the benchmarks.
48
+
49
+ #### Offline benchmark
50
+
51
+ ```bash
52
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline.jsonl
53
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline.jsonl
54
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline.jsonl
55
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline.jsonl
56
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline.jsonl
57
+ python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 3000 --output-file offline.jsonl
58
+ cat offline.jsonl | cut -d':' -f12 | cut -d',' -f1
59
+ ```
60
+
61
+ #### Online benchmark
62
+
63
+ ```bash
64
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online.jsonl
65
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online.jsonl
66
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online.jsonl
67
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online.jsonl
68
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online.jsonl
69
+ cat online.jsonl | cut -d':' -f9 | cut -d',' -f1
70
+ ```
71
+
72
+ ## Other
73
+
74
+ We tried vLLM 0.5.3.post1, but it often crashes under high load, and our partial benchmarking suggests its performance is similar to or worse than that of vLLM 0.5.2, so we use the older version, vLLM 0.5.2.
75
+
76
+ To prepare TensorRT LLM, refer to https://github.com/sgl-project/tensorrt-demo. Specifically, we used a batch size of 512, a max input length of 8192, and a max number of tokens of 8192. The instance count for preprocessing and postprocessing in Triton Server is 16.
77
+
78
+ ```bash
79
+ # vLLM
80
+ pip install vllm==0.5.2
81
+ pip install jsonschema==4.21.1
82
+
83
+ # Meta-Llama-3-8B-Instruct
84
+ python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-8B-Instruct --disable-log-requests
85
+
86
+ # meta-llama/Meta-Llama-3-70B-Instruct
87
+ python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B-Instruct --disable-log-requests --tensor 8
88
+
89
+ # neuralmagic/Meta-Llama-3-70B-Instruct-FP8
90
+ python -m vllm.entrypoints.openai.api_server --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 --disable-log-requests --tensor 8
91
+ ```
92
+
93
+ ```bash
94
+ wget https://raw.githubusercontent.com/sgl-project/sglang/main/python/sglang/bench_serving.py
95
+ ```
96
+
97
+ ```bash
98
+ # vLLM Offline
99
+
100
+ python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline_vllm.jsonl
101
+ python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline_vllm.jsonl
102
+ python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline_vllm.jsonl
103
+ python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline_vllm.jsonl
104
+ python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline_vllm.jsonl
105
+ python3 bench_serving.py --backend vllm --dataset-name sharegpt --num-prompts 3000 --output-file offline_vllm.jsonl
106
+ cat offline_vllm.jsonl | cut -d':' -f12 | cut -d',' -f1
107
+ ```
108
+
109
+ ```bash
110
+ # vLLM Online
111
+
112
+ python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online_vllm.jsonl
113
+ python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online_vllm.jsonl
114
+ python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online_vllm.jsonl
115
+ python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online_vllm.jsonl
116
+ python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online_vllm.jsonl
117
+ cat online_vllm.jsonl | cut -d':' -f9 | cut -d',' -f1
118
+ ```
119
+
120
+ ```bash
121
+ # TensorRT LLM Offline 8B
122
+
123
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline_trt_8b.jsonl
124
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline_trt_8b.jsonl
125
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline_trt_8b.jsonl
126
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline_trt_8b.jsonl
127
+ python3 bench_serving.py --backend trt --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline_trt_8b.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
128
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name sharegpt --num-prompts 3000 --output-file offline_trt_8b.jsonl
129
+ cat offline_trt_8b.jsonl | cut -d':' -f12 | cut -d',' -f1
130
+ ```
131
+
132
+ ```bash
133
+ # TensorRT LLM Online 8B
134
+
135
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online_trt_8b.jsonl
136
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online_trt_8b.jsonl
137
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online_trt_8b.jsonl
138
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online_trt_8b.jsonl
139
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online_trt_8b.jsonl
140
+ cat online_trt_8b.jsonl | cut -d':' -f9 | cut -d',' -f1
141
+ ```
142
+
143
+ ```bash
144
+ # TensorRT LLM Offline 70B
145
+
146
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline_trt_70b.jsonl
147
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline_trt_70b.jsonl
148
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline_trt_70b.jsonl
149
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline_trt_70b.jsonl
150
+ python3 bench_serving.py --backend trt --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline_trt_70b.jsonl --model meta-llama/Meta-Llama-3-70B-Instruct
151
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name sharegpt --num-prompts 3000 --output-file offline_trt_70b.jsonl
152
+ cat offline_trt_70b.jsonl | cut -d':' -f12 | cut -d',' -f1
153
+ ```
154
+
155
+ ```bash
156
+ # TensorRT LLM Online 70B
157
+
158
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online_trt_70b.jsonl
159
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online_trt_70b.jsonl
160
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online_trt_70b.jsonl
161
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online_trt_70b.jsonl
162
+ python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online_trt_70b.jsonl
163
+ cat online_trt_70b.jsonl | cut -d':' -f9 | cut -d',' -f1
164
+ ```
sglang/benchmark/blog_v0_2/config.md ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### used for TensorRT LLM
2
+
3
+ ```
4
+ {
5
+ "architecture": "LlamaForCausalLM",
6
+ "dtype": "float16",
7
+ "logits_dtype": "float32",
8
+ "vocab_size": 128256,
9
+ "max_position_embeddings": 8192,
10
+ "hidden_size": 16384,
11
+ "num_hidden_layers": 126,
12
+ "num_attention_heads": 128,
13
+ "num_key_value_heads": 16,
14
+ "head_size": 128,
15
+ "qk_layernorm": false,
16
+ "hidden_act": "silu",
17
+ "intermediate_size": 53248,
18
+ "norm_epsilon": 1e-05,
19
+ "position_embedding_type": "rope_gpt_neox",
20
+ "use_parallel_embedding": false,
21
+ "embedding_sharding_dim": 0,
22
+ "share_embedding_table": false,
23
+ "mapping": {
24
+ "world_size": 8,
25
+ "tp_size": 8,
26
+ "pp_size": 1,
27
+ "gpus_per_node": 8
28
+ },
29
+ "quantization": {
30
+ "quant_algo": "FP8",
31
+ "kv_cache_quant_algo": null,
32
+ "group_size": 128,
33
+ "smoothquant_val": null,
34
+ "has_zero_point": false,
35
+ "pre_quant_scale": false,
36
+ "exclude_modules": [
37
+ "lm_head"
38
+ ]
39
+ },
40
+ "kv_dtype": "float16",
41
+ "rotary_scaling": null,
42
+ "residual_mlp": false,
43
+ "moe_normalization_mode": null,
44
+ "rotary_base": 500000.0,
45
+ "moe_num_experts": 0,
46
+ "moe_top_k": 0,
47
+ "moe_tp_mode": 2,
48
+ "attn_bias": false,
49
+ "disable_weight_only_quant_plugin": false,
50
+ "mlp_bias": false
51
+ }
52
+ ```
53
+
54
+ ### used for vLLM and SGLang
55
+
56
+ ```
57
+ {
58
+ "_name_or_path": "dummy_fp8",
59
+ "architectures": [
60
+ "LlamaForCausalLM"
61
+ ],
62
+ "attention_bias": false,
63
+ "attention_dropout": 0.0,
64
+ "bos_token_id": 128000,
65
+ "eos_token_id": 128009,
66
+ "hidden_act": "silu",
67
+ "hidden_size": 16384,
68
+ "initializer_range": 0.02,
69
+ "intermediate_size": 53248,
70
+ "mlp_bias": false,
71
+ "model_type": "llama",
72
+ "num_attention_heads": 128,
73
+ "num_hidden_layers": 126,
74
+ "num_key_value_heads": 8,
75
+ "pretraining_tp": 1,
76
+ "quantization_config": {
77
+ "activation_scheme": "static",
78
+ "ignored_layers": [
79
+ "lm_head"
80
+ ],
81
+ "quant_method": "fp8"
82
+ },
83
+ "rope_scaling": {
84
+ "factor": 8.0,
85
+ "low_freq_factor": 1.0,
86
+ "high_freq_factor": 4.0,
87
+ "original_max_position_embeddings": 8192,
88
+ "rope_type": "llama3"
89
+ },
90
+ "max_position_embeddings": 131072,
91
+ "rms_norm_eps": 1e-05,
92
+ "rope_scaling": null,
93
+ "rope_theta": 500000.0,
94
+ "tie_word_embeddings": false,
95
+ "torch_dtype": "bfloat16",
96
+ "transformers_version": "4.41.1",
97
+ "use_cache": true,
98
+ "vocab_size": 128256
99
+ }
100
+ ```
sglang/benchmark/boolq/README.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Download data
2
+ ```
3
+ git clone https://hf-mirror.com/datasets/google/boolq
4
+ ```
5
+
6
+ ## Convert parquet to json
7
+ ```
8
+ bash parquet_to_json.sh
9
+ ```
10
+ ## Run benchmark
11
+
12
+ ### Benchmark sglang
13
+ ```
14
+ python -m sglang.launch_server --model-path ramblingpolymath/Qwen3-32B-W8A8 --port 30000
15
+ ```
16
+
17
+ ```
18
+ python3 bench_sglang.py
19
+ ```
sglang/benchmark/boolq/bench_sglang.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import time
4
+
5
+ import numpy as np
6
+
7
+ from sglang.api import set_default_backend
8
+ from sglang.test.test_utils import (
9
+ add_common_sglang_args_and_parse,
10
+ select_sglang_backend,
11
+ )
12
+ from sglang.utils import read_jsonl
13
+
14
+
15
+ def get_example(lines, i, answer):
16
+ prompt = "Question: " + lines[i]["question"] + lines[i]["passage"] + "\nAnswer:"
17
+ if answer:
18
+ prompt += str(lines[i]["answer"])
19
+ return prompt
20
+
21
+
22
+ def few_shot_examples(lines, k):
23
+ prompts = ""
24
+ for i in range(k):
25
+ prompts += get_example(lines, i, True) + "\n\n"
26
+ return prompts
27
+
28
+
29
+ def main(args):
30
+ # Select backend
31
+ set_default_backend(select_sglang_backend(args))
32
+
33
+ # Read data
34
+ train_data_path = args.train_data_path
35
+ test_data_path = args.test_data_path
36
+ lines_train = list(read_jsonl(train_data_path))
37
+ lines_test = list(read_jsonl(test_data_path))
38
+
39
+ # Construct prompts
40
+ num_questions = args.num_questions
41
+ num_shots = args.num_shots
42
+ few_shots = few_shot_examples(lines_train, num_shots)
43
+
44
+ questions = []
45
+ answer = []
46
+ for i in range(len(lines_test[:num_questions])):
47
+ questions.append(get_example(lines_test, i, False))
48
+ answer.append(str(lines_test[i]["answer"]))
49
+ arguments = [{"question": q} for q in questions]
50
+
51
+ #####################################
52
+ ######### SGL Program Begin #########
53
+ #####################################
54
+
55
+ import sglang as sgl
56
+
57
+ @sgl.function
58
+ def few_shot_boolq(s, question):
59
+ s += few_shots + question
60
+ s += sgl.gen("answer", max_tokens=5, stop=["\n"])
61
+
62
+ #####################################
63
+ ########## SGL Program End ##########
64
+ #####################################
65
+
66
+ # Run requests
67
+ tic = time.perf_counter()
68
+ states = few_shot_boolq.run_batch(
69
+ arguments,
70
+ temperature=0,
71
+ num_threads=args.parallel,
72
+ progress_bar=True,
73
+ )
74
+ latency = time.perf_counter() - tic
75
+
76
+ preds = []
77
+ for i in range(len(states)):
78
+ preds.append(states[i]["answer"])
79
+
80
+ # Compute accuracy
81
+ acc = np.mean(np.array(preds) == np.array(answer))
82
+
83
+ # Compute speed
84
+ num_output_tokens = sum(
85
+ s.get_meta_info("answer")["completion_tokens"] for s in states
86
+ )
87
+ output_throughput = num_output_tokens / latency
88
+
89
+ # Print results
90
+ print(f"Accuracy: {acc:.3f}")
91
+ print(f"Latency: {latency:.3f} s")
92
+ print(f"Output throughput: {output_throughput:.3f} token/s")
93
+
94
+ # Results
95
+ with open(args.result_file, "a") as fout:
96
+ value = {
97
+ "task": "boolq",
98
+ "backend": args.backend,
99
+ "num_gpus": 1,
100
+ "latency": round(latency, 3),
101
+ "accuracy": round(acc, 3),
102
+ "num_requests": args.num_questions,
103
+ "other": {
104
+ "num_questions": args.num_questions,
105
+ "parallel": args.parallel,
106
+ },
107
+ }
108
+ fout.write(json.dumps(value) + "\n")
109
+
110
+
111
+ if __name__ == "__main__":
112
+ parser = argparse.ArgumentParser()
113
+ parser.add_argument("--num-shots", type=int, default=5)
114
+ parser.add_argument(
115
+ "--train-data-path", type=str, default="./boolq/data/train-00000-of-00001.json"
116
+ )
117
+ parser.add_argument(
118
+ "--test-data-path",
119
+ type=str,
120
+ default="./boolq/data/validation-00000-of-00001.json",
121
+ )
122
+ parser.add_argument("--num-questions", type=int, default=200)
123
+ args = add_common_sglang_args_and_parse(parser)
124
+ main(args)
sglang/benchmark/boolq/convert_parquet_to_json.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ import pyarrow.parquet as pq
4
+
5
+
6
+ def convert_parquet_to_json(input_file, output_file):
7
+ # read parquet file
8
+ table = pq.read_table(input_file)
9
+
10
+ # turn parquet data to dataframe
11
+ df = table.to_pandas()
12
+
13
+ # turn dataframe to json form
14
+ json_data = df.to_json(orient="records", lines=True)
15
+
16
+ # write json to file
17
+ with open(output_file, "w") as f:
18
+ f.write(json_data)
19
+
20
+
21
if __name__ == "__main__":
    # Command-line entry point: expects exactly two positional arguments,
    # the input parquet path followed by the output JSON path.
    if len(sys.argv) != 3:
        print("Usage:python convert_parquet_to_json.py <input_file> <output_file>")
        # Stop on bad usage; falling through would raise an IndexError on the
        # sys.argv lookups below instead of leaving a clean exit status.
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2]

    convert_parquet_to_json(input_file, output_file)
sglang/benchmark/boolq/parquet_to_json.sh ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # define input and output directories
4
+ input_dir="./boolq/data"
5
+ output_dir="./boolq/data"
6
+
7
+ # define the files to be converted
8
+ files=(
9
+ "train-00000-of-00001.parquet"
10
+ "validation-00000-of-00001.parquet"
11
+ )
12
+
13
+ # for each file above, use the python script to convert it to json
14
+ for file in "${files[@]}"; do
15
+ input_file="${input_dir}/${file}"
16
+ output_file="${output_dir}/${file%.parquet}.json"
17
+
18
+ echo "Converting ${input_file} to ${output_file} ..."
19
+ python3 convert_parquet_to_json.py "${input_file}" "${output_file}"
20
+
21
+ if [ $? -eq 0 ]; then
22
+ echo "Conversion successful: ${output_file}"
23
+ else
24
+ echo "Conversion failed: ${input_file}"
25
+ fi
26
+ done
sglang/benchmark/ceval/README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Download data
2
+ ```
3
+ git lfs clone https://huggingface.co/datasets/ceval/ceval-exam
4
+ ```
5
+
6
+ ## Run benchmark
7
+
8
+ ### Benchmark sglang
9
+ ```
10
+ python -m sglang.launch_server --model-path ramblingpolymath/Qwen3-32B-W8A8 --port 30000
11
+ ```
12
+
13
+ ```
14
+ python3 bench_sglang.py
15
+ ```
sglang/benchmark/ceval/bench_sglang.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import random
5
+ import re
6
+ import time
7
+
8
+ import numpy as np
9
+ from datasets import load_dataset
10
+
11
+ from sglang.lang.api import set_default_backend
12
+ from sglang.test.test_utils import (
13
+ add_common_sglang_args_and_parse,
14
+ select_sglang_backend,
15
+ )
16
+
17
# Answer letters used as the random fallback when no answer can be parsed.
choices = list("ABCD")
18
+
19
+
20
def get_one_example(line, include_answer):
    """Format one multiple-choice record as prompt text.

    Args:
        line: Mapping with the keys "question", "A", "B", "C", "D" and, when
            ``include_answer`` is true, "answer".
        include_answer: Whether to append the "Answer: ..." line.

    Returns:
        The question followed by the four lettered options, each on its own
        line, optionally followed by the answer line.
    """
    pieces = [line["question"]]
    pieces.extend(f"\n{letter}. {line[letter]}" for letter in ("A", "B", "C", "D"))
    if include_answer:
        pieces.append(f"\nAnswer: {line['answer']} \n\n")
    return "".join(pieces)
30
+
31
+
32
def get_few_shot_examples(lines):
    """Concatenate answered examples into a single few-shot prompt prefix.

    Args:
        lines: Iterable of records accepted by ``get_one_example``.

    Returns:
        Every record formatted with its answer, each followed by two
        newlines; an empty string when ``lines`` is empty.
    """
    return "".join(get_one_example(record, True) + "\n\n" for record in lines)
37
+
38
+
39
def get_answer_value(response):
    """Extract the chosen option letter from a model response.

    Looks for one of the known answer markers followed by a letter A-D that
    is not immediately followed by another word character.

    Args:
        response: Generated text to search.

    Returns:
        The matched letter, or a random element of ``choices`` when no marker
        is found.
    """
    found = re.search(
        r"(Answer:|answer:|答案是|答案是:|正确答案是:|答案:|Assistant:)\s*([A-D])(?![\w])",
        response,
    )
    if found is None:
        # Nothing recognizable in the response: fall back to a random guess.
        return random.choice(choices)
    return found.group(2)
47
+
48
+
49
def main(args):
    """Run the C-Eval few-shot benchmark through sglang and report results.

    Each sub-directory of ``args.data_path`` (except ".git") is loaded as one
    dataset configuration. Its "dev" split supplies the few-shot examples and
    every row of its "val" split becomes one request. Accuracy, latency and
    output throughput are printed, and a JSON summary line is appended to
    ``args.result_file``.

    Args:
        args: Parsed command-line namespace. Reads ``data_path``,
            ``num_questions``, ``parallel``, ``backend`` and ``result_file``;
            the namespace is also handed to ``select_sglang_backend``.
    """
    # Read data && Construct prompts
    arguments = []
    labels = []
    data_path = args.data_path
    # os.listdir returns entries in arbitrary order; sort them so that a
    # --num-questions subset is the same on every run and machine.
    for subject in sorted(os.listdir(data_path)):
        subject_path = os.path.join(data_path, subject)
        if os.path.isdir(subject_path) and subject != ".git":
            dataset = load_dataset(data_path, name=subject)
            dev_lines_temp = dataset["dev"]
            val_lines_temp = dataset["val"]
            few_shot_examples = get_few_shot_examples(dev_lines_temp)
            for val_line in val_lines_temp:
                arguments.append(
                    {
                        "examples": few_shot_examples,
                        "question": get_one_example(val_line, False),
                    }
                )
                labels.append(val_line["answer"])

    #####################################
    ######### SGL Program Begin #########
    #####################################

    import sglang as sgl

    @sgl.function
    def few_shot_ceval(s, examples, question):
        s += examples + question + sgl.gen("Answer")

    #####################################
    ########## SGL Program End ##########
    #####################################

    # Cap the request count at the number of prompts actually built; an
    # oversized --num-questions would otherwise index past the end of the
    # returned states.
    requested = args.num_questions if args.num_questions else len(arguments)
    num_questions = min(requested, len(arguments))

    # Select backend
    set_default_backend(select_sglang_backend(args))

    # Run requests
    tic = time.perf_counter()
    states = few_shot_ceval.run_batch(
        arguments[:num_questions],
        temperature=0,
        num_threads=args.parallel,
        progress_bar=True,
    )
    latency = time.perf_counter() - tic

    preds = [get_answer_value(states[i]["Answer"]) for i in range(num_questions)]

    # Compute accuracy
    acc = np.mean(np.array(preds) == np.array(labels[:num_questions]))

    # Compute speed
    num_output_tokens = sum(
        s.get_meta_info("Answer")["completion_tokens"] for s in states
    )
    output_throughput = num_output_tokens / latency

    # Print results
    print(f"Accuracy: {acc:.3f}")
    print(f"Latency: {latency:.3f} s")
    print(f"Output throughput: {output_throughput:.3f} token/s")

    # Write results
    with open(args.result_file, "a") as fout:
        value = {
            "task": "ceval",
            "backend": args.backend,
            "num_gpus": 1,
            "latency": round(latency, 3),
            "accuracy": round(acc, 3),
            # Record the number of requests actually issued; the raw CLI
            # value is None when --num-questions is omitted.
            "num_requests": num_questions,
            "other": {
                "parallel": args.parallel,
            },
        }
        fout.write(json.dumps(value) + "\n")
131
+
132
+
133
if __name__ == "__main__":
    # Benchmark-specific options. The remaining options are presumably added
    # by add_common_sglang_args_and_parse, which also returns the parsed
    # namespace passed on to main.
    cli = argparse.ArgumentParser()
    cli.add_argument("--data-path", type=str, default="ceval/ceval-exam")
    cli.add_argument("--num-questions", type=int, default=None)
    main(add_common_sglang_args_and_parse(cli))