Lekr0 committed on Apr 13

Commit

a402b9b

verified ·

1 Parent(s): 61ba51e

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_39000/README.md +207 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_39000/adapter_config.json +43 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_44000/README.md +207 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_44000/adapter_config.json +43 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_76500/README.md +207 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_76500/adapter_config.json +43 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_81500/README.md +207 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_81500/adapter_config.json +43 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_89000/README.md +207 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_89000/adapter_config.json +43 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_95000/README.md +207 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_95000/adapter_config.json +43 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_112500/adapter_config.json +43 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_117000/README.md +207 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_117000/adapter_config.json +43 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_144500/README.md +207 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_148500/README.md +207 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_148500/adapter_config.json +43 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_164000/README.md +207 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_169500/README.md +207 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_169500/adapter_config.json +43 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_188500/README.md +207 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_188500/adapter_config.json +43 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_189500/README.md +207 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_189500/adapter_config.json +43 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_202500/README.md +207 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_202500/adapter_config.json +43 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_217000/README.md +207 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_217000/adapter_config.json +43 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_218000/README.md +207 -0
progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_218000/adapter_config.json +43 -0
sglang/benchmark/asr/README.md +166 -0
sglang/benchmark/asr/bench_sglang.py +404 -0
sglang/benchmark/bench_attention_sink/bench_attention_sink_triton.py +250 -0
sglang/benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py +130 -0
sglang/benchmark/bench_rope/benchmark_rope_index.py +425 -0
sglang/benchmark/benchmark_batch/benchmark_batch.py +193 -0
sglang/benchmark/benchmark_batch/benchmark_tokenizer.py +237 -0
sglang/benchmark/benchmark_vllm_060/README.md +89 -0
sglang/benchmark/blog_v0_2/405b_sglang.sh +24 -0
sglang/benchmark/blog_v0_2/405b_trt.sh +17 -0
sglang/benchmark/blog_v0_2/405b_vllm.sh +24 -0
sglang/benchmark/blog_v0_2/README.md +164 -0
sglang/benchmark/blog_v0_2/config.md +100 -0
sglang/benchmark/boolq/README.md +19 -0
sglang/benchmark/boolq/bench_sglang.py +124 -0
sglang/benchmark/boolq/convert_parquet_to_json.py +28 -0
sglang/benchmark/boolq/parquet_to_json.sh +26 -0
sglang/benchmark/ceval/README.md +15 -0
sglang/benchmark/ceval/bench_sglang.py +138 -0

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_39000/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /workspace/Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen3-8B
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_39000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_44000/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /workspace/Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen3-8B
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_44000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_76500/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /workspace/Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen3-8B
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_76500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_81500/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /workspace/Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen3-8B
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_81500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_89000/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /workspace/Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen3-8B
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for it should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_89000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_95000/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /workspace/Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen3-8B
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information is needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for it should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_0_step_95000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_112500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_117000/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /workspace/Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen3-8B
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information is needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for it should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_117000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_144500/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /workspace/Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen3-8B
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information is needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for it should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_148500/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /workspace/Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen3-8B
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information is needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for it should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_148500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_164000/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /workspace/Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen3-8B
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_169500/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /workspace/Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen3-8B
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_169500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_188500/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /workspace/Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen3-8B
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_188500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_189500/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /workspace/Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen3-8B
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_1_step_189500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_202500/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /workspace/Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen3-8B
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_202500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_217000/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /workspace/Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen3-8B
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_217000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_218000/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /workspace/Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen3-8B
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

progress/SpecForge/outputs/qwen3-8b-dflash-lora/epoch_2_step_218000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

sglang/benchmark/asr/README.md ADDED Viewed

	@@ -0,0 +1,166 @@

+# ASR Benchmark
+This benchmark evaluates the performance and accuracy (Word Error Rate - WER) of Automatic Speech Recognition (ASR) models served via SGLang.
+## Supported Models
+- `openai/whisper-large-v3`
+- `openai/whisper-large-v3-turbo`
+## Setup
+Install the required dependencies:
+```bash
+apt install ffmpeg
+pip install librosa soundfile datasets evaluate jiwer transformers openai torchcodec torch
+```
+## Running the Benchmark
+### 1. Start SGLang Server
+Launch the SGLang server with a Whisper model:
+```bash
+python -m sglang.launch_server --model-path openai/whisper-large-v3 --port 30000
+```
+### 2. Run the Benchmark Script
+Basic usage (using chat completions API):
+```bash
+python bench_sglang.py --base-url http://localhost:30000 --model openai/whisper-large-v3 --n-examples 10
+```
+Using the OpenAI-compatible transcription API:
+```bash
+python bench_sglang.py \
+    --base-url http://localhost:30000 \
+    --model openai/whisper-large-v3 \
+    --api-type transcription \
+    --language English \
+    --n-examples 10
+```
+Run with streaming and show real-time output:
+```bash
+python bench_sglang.py \
+    --base-url http://localhost:30000 \
+    --model openai/whisper-large-v3 \
+    --api-type transcription \
+    --stream \
+    --show-predictions \
+    --concurrency 1
+```
+Run with higher concurrency and save results:
+```bash
+python bench_sglang.py \
+    --base-url http://localhost:30000 \
+    --model openai/whisper-large-v3 \
+    --concurrency 8 \
+    --n-examples 100 \
+    --output results.json \
+    --show-predictions
+```
+## Arguments
+| Argument | Description | Default |
+|----------|-------------|---------|
+| `--base-url` | SGLang server URL | `http://localhost:30000` |
+| `--model` | Model name on the server | `openai/whisper-large-v3` |
+| `--dataset` | HuggingFace dataset for evaluation | `D4nt3/esb-datasets-earnings22-validation-tiny-filtered` |
+| `--split` | Dataset split to use | `validation` |
+| `--concurrency` | Number of concurrent requests | `4` |
+| `--n-examples`, `-n` | Number of examples to process (`-1` for all) | `-1` |
+| `--output` | Path to save results as JSON | `None` |
+| `--show-predictions` | Display sample predictions | `False` |
+| `--print-n` | Number of samples to display | `5` |
+| `--api-type` | API to use: `chat` (chat completions) or `transcription` (audio transcriptions) | `chat` |
+| `--language` | Language for transcription API (e.g., `English`, `en`) | `None` |
+| `--stream` | Enable streaming mode for transcription API | `False` |
+## Metrics
+The benchmark outputs:
+| Metric | Description |
+|--------|-------------|
+| **Total Requests** | Number of successful ASR requests processed |
+| **WER** | Word Error Rate (lower is better), computed using the `evaluate` library |
+| **Average Latency** | Mean time per request (seconds) |
+| **Median Latency** | 50th percentile latency (seconds) |
+| **95th Latency** | 95th percentile latency (seconds) |
+| **Throughput** | Requests processed per second |
+| **Token Throughput** | Output tokens per second |
+## Example Output
+```bash
+python bench_sglang.py --api-type transcription --concurrency 128 --model openai/whisper-large-v3 --show-predictions
+Loading dataset: D4nt3/esb-datasets-earnings22-validation-tiny-filtered...
+Using API type: transcription
+Repo card metadata block was not found. Setting CardData to empty.
+WARNING:huggingface_hub.repocard:Repo card metadata block was not found. Setting CardData to empty.
+Performing warmup...
+Processing 511 samples...
+------------------------------
+Results for openai/whisper-large-v3:
+Total Requests: 511
+WER: 12.7690
+Average Latency: 1.3602s
+Median Latency: 1.2090s
+95th Latency: 2.9986s
+Throughput: 19.02 req/s
+Token Throughput: 354.19 tok/s
+Total Test Time: 26.8726s
+------------------------------
+==================== Sample Predictions ====================
+Sample 1:
+  REF: on the use of taxonomy i you know i think it is it is early days for us to to make any clear indications to the market about the proportion that would fall under that requirement
+  PRED: on the eu taxonomy i think it is early days for us to make any clear indications to the market about the proportion that would fall under that requirement
+----------------------------------------
+Sample 2:
+  REF: so within fiscal year 2021 say 120 a 100 depending on what the micro will do and next year it is not necessarily payable in q one is we will look at what the cash flows for 2022 look like
+  PRED: so within fiscal year 2021 say $120000 $100000 depending on what the macro will do and next year it is not necessarily payable in q one is we will look at what the cash flows for 2022 look like
+----------------------------------------
+Sample 3:
+  REF: we talked about 4.7 gigawatts
+  PRED: we talked about 4.7 gigawatts
+----------------------------------------
+Sample 4:
+  REF: and you know depending on that working capital build we will we will see what that yields
+  PRED: and depending on that working capital build we will see what that yields what
+----------------------------------------
+Sample 5:
+  REF: so on on sinopec what we have agreed with sinopec way back then is that free cash flows after paying all capexs are distributed out 30 70%
+  PRED: so on sinopec what we have agreed with sinopec way back then is that free cash flows after paying all capexes are distributed out 30% 70%
+----------------------------------------
+============================================================
+```
+## Notes
+- Audio samples of 30 seconds or longer are automatically filtered out (Whisper limitation)
+- The benchmark performs a warmup request before measuring performance
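+- Predictions and references are normalized with the model tokenizer's `normalize` method when it exists; otherwise they are lowercased and stripped before WER is computed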
+- When using `--stream` with `--show-predictions`, use `--concurrency 1` for clean sequential output
+- The `--language` option accepts both full names (e.g., `English`) and ISO 639-1 codes (e.g., `en`)
+## Troubleshooting
+**Server connection refused**
+- Ensure the SGLang server is running and accessible at the specified `--base-url`
+- Check that the port is not blocked by a firewall
+**Out of memory errors**
+- Reduce `--concurrency` to lower GPU memory usage
+- Use a smaller Whisper model variant

sglang/benchmark/asr/bench_sglang.py ADDED Viewed

	@@ -0,0 +1,404 @@

+import argparse
+import asyncio
+import base64
+import io
+import json
+import time
+from statistics import mean, median
+import httpx
+import librosa
+import numpy as np
+import soundfile
+from datasets import load_dataset
+from evaluate import load
+from openai import AsyncOpenAI, OpenAI
+from transformers import AutoTokenizer
+def to_bytes(y, sr):
+    buffer = io.BytesIO()
+    soundfile.write(buffer, y, sr, format="WAV")
+    buffer.seek(0)
+    return buffer
+async def run_asr_chat(client, model_name, y, sr):
+    """Use chat completions API with audio_url for ASR."""
+    with to_bytes(y, sr) as f:
+        audio_bytes = f.read()
+        audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
+    start_time = time.perf_counter()
+    response = await client.chat.completions.create(
+        model=model_name,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "audio_url",
+                        "audio_url": {"url": f"data:audio/wav;base64,{audio_base64}"},
+                    }
+                ],
+            }
+        ],
+        temperature=0.0,
+    )
+    end_time = time.perf_counter()
+    asr_text = response.choices[0].message.content
+    latency = end_time - start_time
+    return latency, asr_text
+def run_asr_transcription_sync(client, model_name, y, sr, language=None):
+    """Use audio transcriptions API for ASR (sync version)."""
+    audio_buffer = to_bytes(y, sr)
+    audio_buffer.name = "audio.wav"  # OpenAI client needs a name attribute
+    start_time = time.perf_counter()
+    kwargs = {
+        "model": model_name,
+        "file": audio_buffer,
+    }
+    if language:
+        kwargs["language"] = language
+    transcription = client.audio.transcriptions.create(**kwargs)
+    end_time = time.perf_counter()
+    latency = end_time - start_time
+    return latency, transcription.text
+def run_asr_transcription_stream_sync(
+    base_url, model_name, y, sr, language=None, show_stream=False
+):
+    """Use audio transcriptions API with streaming for ASR."""
+    audio_buffer = to_bytes(y, sr)
+    audio_bytes = audio_buffer.read()
+    data = {
+        "model": model_name,
+        "response_format": "json",
+        "stream": "true",
+    }
+    if language:
+        data["language"] = language
+    start_time = time.perf_counter()
+    text_chunks = []
+    if show_stream:
+        print("[STREAM] ", end="", flush=True)
+    with httpx.stream(
+        "POST",
+        f"{base_url}/v1/audio/transcriptions",
+        data=data,
+        files={"file": ("audio.wav", audio_bytes, "audio/wav")},
+        timeout=60.0,
+    ) as response:
+        for line in response.iter_lines():
+            if line.startswith("data: ") and not line.startswith("data: [DONE]"):
+                try:
+                    chunk = json.loads(line[6:])
+                    if "choices" in chunk and chunk["choices"]:
+                        delta = chunk["choices"][0].get("delta", {})
+                        content = delta.get("content", "")
+                        if content:
+                            text_chunks.append(content)
+                            if show_stream:
+                                print(content, end="", flush=True)
+                except json.JSONDecodeError:
+                    pass
+    if show_stream:
+        print()  # newline after stream
+    end_time = time.perf_counter()
+    latency = end_time - start_time
+    return latency, "".join(text_chunks)
+async def run_asr_transcription(
+    client,
+    model_name,
+    y,
+    sr,
+    language=None,
+    stream=False,
+    base_url=None,
+    show_stream=False,
+):
+    """Async wrapper for transcription API (runs sync call in executor)."""
+    loop = asyncio.get_event_loop()
+    if stream:
+        return await loop.run_in_executor(
+            None,
+            run_asr_transcription_stream_sync,
+            base_url,
+            model_name,
+            y,
+            sr,
+            language,
+            show_stream,
+        )
+    return await loop.run_in_executor(
+        None, run_asr_transcription_sync, client, model_name, y, sr, language
+    )
+async def bound_asr(
+    sem,
+    client,
+    model_name,
+    tokenizer,
+    audio,
+    reference,
+    api_type="chat",
+    language=None,
+    stream=False,
+    base_url=None,
+    show_stream=False,
+):
+    async with sem:
+        try:
+            if api_type == "transcription":
+                latency, text = await run_asr_transcription(
+                    client,
+                    model_name,
+                    *audio,
+                    language=language,
+                    stream=stream,
+                    base_url=base_url,
+                    show_stream=show_stream,
+                )
+            else:
+                latency, text = await run_asr_chat(client, model_name, *audio)
+            # Calculate tokens for throughput metrics
+            num_output_tokens = len(tokenizer(text, add_special_tokens=False).input_ids)
+            # Normalize for WER evaluation
+            # Whisper tokenizer has a normalize method
+            if hasattr(tokenizer, "normalize"):
+                out = tokenizer.normalize(text)
+                ref = tokenizer.normalize(reference)
+            else:
+                out = text.lower().strip()
+                ref = reference.lower().strip()
+            return latency, num_output_tokens, out, ref
+        except Exception as e:
+            print(f"Error during ASR: {e}")
+            return None
+async def process_dataset(
+    model_name,
+    client,
+    data,
+    concurrent_request,
+    api_type="chat",
+    language=None,
+    stream=False,
+    base_url=None,
+    show_predictions=False,
+):
+    sem = asyncio.Semaphore(concurrent_request)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    # Warmup
+    print("Performing warmup...")
+    audio_warmup, sr_warmup = (
+        data[0]["audio"]["array"],
+        data[0]["audio"]["sampling_rate"],
+    )
+    await bound_asr(
+        sem,
+        client,
+        model_name,
+        tokenizer,
+        (audio_warmup, sr_warmup),
+        "",
+        api_type=api_type,
+        language=language,
+        stream=stream,
+        base_url=base_url,
+        show_stream=False,  # Don't show stream during warmup
+    )
+    tasks = []
+    print(f"Processing {len(data)} samples...")
+    for sample in data:
+        audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
+        tasks.append(
+            asyncio.create_task(
+                bound_asr(
+                    sem,
+                    client,
+                    model_name,
+                    tokenizer,
+                    (audio, sr),
+                    sample["text"],
+                    api_type=api_type,
+                    language=language,
+                    stream=stream,
+                    base_url=base_url,
+                    show_stream=show_predictions and stream,
+                )
+            )
+        )
+    results = await asyncio.gather(*tasks)
+    return [r for r in results if r is not None]
+def run_evaluation(args):
+    # Use sync client for transcription API, async for chat API
+    if args.api_type == "transcription":
+        client = OpenAI(base_url=f"{args.base_url}/v1", api_key="None")
+    else:
+        client = AsyncOpenAI(base_url=f"{args.base_url}/v1", api_key="None")
+    print(f"Loading dataset: {args.dataset}...")
+    print(f"Using API type: {args.api_type}" + (f" (streaming)" if args.stream else ""))
+    dataset = load_dataset(args.dataset, split=args.split)
+    # Filter by duration if needed (Whisper max is 30s)
+    def add_duration(sample):
+        y, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
+        sample["duration_ms"] = librosa.get_duration(y=y, sr=sr) * 1000
+        return sample
+    if "duration_ms" not in dataset.column_names:
+        dataset = dataset.map(add_duration)
+    dataset = dataset.filter(lambda x: x["duration_ms"] < 30000)
+    if args.n_examples > 0:
+        dataset = dataset.select(range(min(args.n_examples, len(dataset))))
+    start = time.perf_counter()
+    results = asyncio.run(
+        process_dataset(
+            args.model,
+            client,
+            dataset,
+            args.concurrency,
+            api_type=args.api_type,
+            language=args.language,
+            stream=args.stream,
+            base_url=args.base_url,
+            show_predictions=args.show_predictions,
+        )
+    )
+    total_test_time = time.perf_counter() - start
+    if not results:
+        print("No successful results to evaluate.")
+        return
+    # Metrics
+    latencies = [res[0] for res in results]
+    total_tokens = sum([res[1] for res in results])
+    predictions = [res[2] for res in results]
+    references = [res[3] for res in results]
+    wer_metric = load("wer")
+    wer_score = 100 * wer_metric.compute(references=references, predictions=predictions)
+    print("-" * 30)
+    print(f"Results for {args.model}:")
+    print(f"Total Requests: {len(results)}")
+    print(f"WER: {wer_score:.4f}")
+    print(f"Average Latency: {mean(latencies):.4f}s")
+    print(f"Median Latency: {median(latencies):.4f}s")
+    print(f"95th Latency: {np.percentile(latencies, 95):.4f}s")
+    print(f"Throughput: {len(results) / total_test_time:.2f} req/s")
+    print(f"Token Throughput: {total_tokens / total_test_time:.2f} tok/s")
+    print(f"Total Test Time: {total_test_time:.4f}s")
+    print("-" * 30)
+    if args.output:
+        with open(args.output, "w") as f:
+            import json
+            json.dump(
+                {
+                    "model": args.model,
+                    "dataset": args.dataset,
+                    "wer": wer_score,
+                    "avg_latency": mean(latencies),
+                    "throughput": len(results) / total_test_time,
+                    "token_throughput": total_tokens / total_test_time,
+                },
+                f,
+                indent=2,
+            )
+    if args.show_predictions:
+        print("\n" + "=" * 20 + " Sample Predictions " + "=" * 20)
+        num_to_show = min(args.print_n, len(results))
+        for i in range(num_to_show):
+            print(f"Sample {i+1}:")
+            print(f"  REF: {references[i]}")
+            print(f"  PRED: {predictions[i]}")
+            print("-" * 40)
+        print("=" * 60)
+# Script entry point: parse the command-line options and run the benchmark.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Benchmark sGLang ASR performance.")
+    # Server and model selection.
+    parser.add_argument(
+        "--base-url", default="http://localhost:30000", help="sGLang server base URL"
+    )
+    parser.add_argument(
+        "--model", default="openai/whisper-large-v3", help="Model name on the server"
+    )
+    # Evaluation data.
+    parser.add_argument(
+        "--dataset",
+        default="D4nt3/esb-datasets-earnings22-validation-tiny-filtered",
+        help="HF dataset repo",
+    )
+    parser.add_argument("--split", default="validation", help="Dataset split")
+    # Load shape: number of in-flight requests and number of samples.
+    parser.add_argument(
+        "--concurrency", type=int, default=4, help="Number of concurrent requests"
+    )
+    parser.add_argument(
+        "--n-examples",
+        "-n",
+        type=int,
+        default=-1,
+        help="Number of examples to test (-1 for all)",
+    )
+    # Reporting options.
+    parser.add_argument("--output", help="Path to save results in JSON")
+    parser.add_argument(
+        "--show-predictions",
+        action="store_true",
+        help="Print sample predictions and references",
+    )
+    parser.add_argument(
+        "--print-n", type=int, default=5, help="Number of sample predictions to print"
+    )
+    # API selection; --language and --stream are forwarded to the transcription path.
+    parser.add_argument(
+        "--api-type",
+        choices=["chat", "transcription"],
+        default="chat",
+        help="API type to use: 'chat' for chat completions with audio_url, 'transcription' for audio.transcriptions API",
+    )
+    parser.add_argument(
+        "--language",
+        default=None,
+        help="Language code for transcription API (e.g., 'en')",
+    )
+    parser.add_argument(
+        "--stream",
+        action="store_true",
+        help="Use streaming mode for transcription API",
+    )
+    args = parser.parse_args()
+    run_evaluation(args)

sglang/benchmark/bench_attention_sink/bench_attention_sink_triton.py ADDED Viewed

	@@ -0,0 +1,250 @@

+import argparse
+import torch
+import triton
+from sglang.srt.layers.attention.triton_ops.decode_attention import (
+    decode_attention_fwd_grouped,
+)
+from sglang.srt.layers.attention.triton_ops.extend_attention import extend_attention_fwd
+# Attention head configuration. The original note reads "gpt oss" — presumably
+# these values mirror the GPT-OSS model; TODO confirm against the caller.
+head_num = 64  # presumably the number of query heads — confirm against usage
+head_dim = 64  # presumably the per-head dimension — confirm against usage
+head_kv_num = 8  # presumably the number of key/value heads — confirm against usage
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["S"],  # sequence length on x-axis
+        x_vals=[128, 256, 512, 1024, 2048, 4096],
+        x_log=True,
+        line_arg="B",  # batch size as different lines
+        line_vals=[1, 8, 32, 128],
+        line_names=["B=1", "B=8", "B=32", "B=128"],
+        styles=[
+            ("blue", "-"),
+            ("green", "-"),
+            ("red", "-"),
+            ("cyan", "-"),
+        ],
+        ylabel="TFLOPS",
+        plot_name="attention-sink-triton-decode",
+        args={},
+    )
+)
+def benchmark_decode(B, S, H_Q, H_KV, D):
+    D_V = D
+    dtype = torch.bfloat16
+    seq_len = S
+    total_tokens = B * seq_len
+    device = torch.device("cuda")
+    sm_scale = 1.0 / (D**0.5)
+    max_kv_splits = 8
+    num_kv_splits = torch.full((B,), 4, dtype=torch.int32, device="cuda")
+    # q represents the new token being generated, one per batch
+    q = torch.randn(B, H_Q, D, dtype=dtype, device="cuda")
+    # k_buffer and v_buffer represent all previous tokens
+    k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
+    v_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
+    o = torch.zeros(B, H_Q, D_V, dtype=dtype, device="cuda")
+    b_seq_len = torch.full((B,), seq_len, device="cuda")
+    kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+    kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len, dim=0)
+    kv_indices = torch.arange(total_tokens, device="cuda")
+    attn_logits1 = torch.empty(
+        (B, H_Q, max_kv_splits, D_V),
+        dtype=torch.float32,
+        device="cuda",
+    )
+    attn_lse1 = torch.empty(
+        (B, H_Q, max_kv_splits, D_V),
+        dtype=torch.float32,
+        device="cuda",
+    )
+    sink = torch.randn(H_Q, device=device, dtype=torch.float32)
+    # warmup
+    for _ in range(5):
+        decode_attention_fwd_grouped(
+            q,
+            k_buffer,
+            v_buffer,
+            o,
+            kv_indptr,
+            kv_indices,
+            attn_logits1,
+            attn_lse1,
+            num_kv_splits,
+            max_kv_splits,
+            sm_scale,
+            logit_cap=0.0,
+            sinks=sink,
+        )
+    # benchmark
+    run_step = 500
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    start_event.record()
+    for _ in range(run_step):
+        decode_attention_fwd_grouped(
+            q,
+            k_buffer,
+            v_buffer,
+            o,
+            kv_indptr,
+            kv_indices,
+            attn_logits1,
+            attn_lse1,
+            num_kv_splits,
+            max_kv_splits,
+            sm_scale,
+            logit_cap=0.0,
+            sinks=sink,
+        )
+    end_event.record()
+    end_event.synchronize()
+    torch.cuda.synchronize()
+    ms = start_event.elapsed_time(end_event) / run_step
+    tflops = lambda ms: (2 * B * S * H_Q * D) * 1e-9 / ms  # must be causal
+    return tflops(ms)
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["S"],  # sequence length on x-axis
+        x_vals=[128, 256, 512, 1024, 2048, 4096],
+        x_log=True,
+        line_arg="B",  # batch size as different lines
+        line_vals=[1, 8, 32, 128],
+        line_names=["B=1", "B=8", "B=32", "B=128"],
+        styles=[
+            ("blue", "-"),
+            ("green", "-"),
+            ("red", "-"),
+            ("cyan", "-"),
+        ],
+        ylabel="TFLOPS",
+        plot_name="attention-sink-triton-extend",
+        args={},
+    )
+)
+def benchmark_extend(B, S, H_Q, H_KV, D):
+    # S here represents N_CTX from the test
+    dtype = torch.bfloat16
+    device = "cuda"
+    # Split S into prefix and extend lengths
+    prefill_len = S // 2  # Similar to test's N_CTX // 2
+    extend_len = S // 4  # Make extend length smaller than prefix
+    # Calculate total tokens and extend tokens
+    total_extend_tokens = B * extend_len
+    total_prefix_tokens = B * prefill_len
+    # Create query, key, value tensors for extension
+    q_extend = torch.randn(total_extend_tokens, H_Q, D, dtype=dtype, device=device)
+    k_extend = torch.randn(total_extend_tokens, H_KV, D, dtype=dtype, device=device)
+    v_extend = torch.randn(total_extend_tokens, H_KV, D, dtype=dtype, device=device)
+    o_extend = torch.empty_like(q_extend)
+    # Create key-value buffers for prefix
+    k_buffer = torch.randn(total_prefix_tokens, H_KV, D, dtype=dtype, device=device)
+    v_buffer = torch.randn(total_prefix_tokens, H_KV, D, dtype=dtype, device=device)
+    # Create index pointers
+    qo_indptr = torch.arange(0, (B + 1) * extend_len, extend_len, device=device).to(
+        torch.int32
+    )
+    kv_indptr = torch.arange(0, (B + 1) * prefill_len, prefill_len, device=device).to(
+        torch.int32
+    )
+    kv_indices = torch.arange(0, total_prefix_tokens, device=device).to(torch.int32)
+    sm_scale = 1.0 / (D**0.5)
+    # sliding_window = 128  # From GPT-OSS config, skip for now
+    sliding_window = -1
+    sink = torch.randn(H_Q, device=device, dtype=torch.float32)
+    # warmup
+    for _ in range(5):
+        extend_attention_fwd(
+            q_extend,
+            k_extend,
+            v_extend,
+            o_extend,
+            k_buffer,
+            v_buffer,
+            qo_indptr,
+            kv_indptr,
+            kv_indices,
+            custom_mask=None,
+            is_causal=True,
+            mask_indptr=None,
+            max_len_extend=extend_len,
+            sm_scale=sm_scale,
+            sliding_window_size=sliding_window,
+            sinks=sink,
+        )
+    # benchmark
+    run_step = 500
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    start_event.record()
+    for _ in range(run_step):
+        extend_attention_fwd(
+            q_extend,
+            k_extend,
+            v_extend,
+            o_extend,
+            k_buffer,
+            v_buffer,
+            qo_indptr,
+            kv_indptr,
+            kv_indices,
+            custom_mask=None,
+            is_causal=True,
+            mask_indptr=None,
+            max_len_extend=extend_len,
+            sm_scale=sm_scale,
+            sliding_window_size=sliding_window,
+            sinks=sink,
+        )
+    end_event.record()
+    end_event.synchronize()
+    torch.cuda.synchronize()
+    ms = start_event.elapsed_time(end_event) / run_step
+    # FLOPS calculation: each attention operation requires 2 multiplications per element
+    total_flops = 2 * total_extend_tokens * H_Q * (prefill_len + extend_len / 2) * D
+    tflops = lambda ms: total_flops * 1e-12 / (ms * 1e-3)  # convert to TFLOPS
+    return tflops(ms)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--bench", type=str, default="all", help="all, extend, decode")
+    args = parser.parse_args()
+    kwargs = {
+        "H_Q": head_num,
+        "H_KV": head_kv_num,
+        "D": head_dim,
+    }
+    if args.bench in ["all", "decode"]:
+        benchmark_decode.run(print_data=True, show_plots=False, **kwargs)
+    if args.bench in ["all", "extend"]:
+        benchmark_extend.run(print_data=True, show_plots=False, **kwargs)
+    print("Benchmark finished!")

sglang/benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py ADDED Viewed

	@@ -0,0 +1,130 @@

+# Benchmark with lots of common prefixes. Used to benchmark prefix caching performance.
+#
+# Launch a server:
+# python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --log-level-http warning
+import random
+import string
+import time
+from tqdm import tqdm
+from transformers import AutoTokenizer
+import sglang as sgl
+from sglang import set_default_backend
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+def generate_random_string(token_length: int) -> str:
+    random_string = "".join(
+        random.choices(string.ascii_letters + string.digits, k=token_length * 100)
+    )
+    tokenized_output = tokenizer.encode(random_string, add_special_tokens=False)[
+        :token_length
+    ]
+    if len(tokenized_output) < token_length:
+        tokenized_output = tokenized_output + [tokenizer.pad_token_id] * (
+            token_length - len(tokenized_output)
+        )
+    decoded_string = tokenizer.decode(tokenized_output, skip_special_tokens=False)
+    return decoded_string
+def generate_unique_prefix(base_text, index):
+    return str(index) + base_text[len(str(index)) :]
+# SGLang program: appends "Q: <question>\n" and then "A:" followed by a generated
+# span stored under the name "answer". Generation uses temperature 0, stops at the
+# first newline and is capped at `gen_len` tokens.
+@sgl.function
+def text_qa(s, question, gen_len):
+    s += "Q: " + question + "\n"
+    s += "A:" + sgl.gen("answer", stop="\n", temperature=0, max_tokens=gen_len)
+def prepare_prompts(num_prefix, num_samples_per_prefix, prefix_length, suffix_length):
+    base_prefix = generate_random_string(prefix_length)
+    tot_input_len = 0
+    all_prompts = []
+    for i in tqdm(range(num_prefix), desc="prepare prompts"):
+        unique_prefix = generate_unique_prefix(base_prefix, i)
+        prompt_list = []
+        for j in range(num_samples_per_prefix):
+            suffix = generate_random_string(suffix_length)
+            prompt = unique_prefix + suffix
+            prompt_list.append(prompt)
+            tot_input_len += len(tokenizer.encode(prompt))
+        all_prompts.append(prompt_list)
+    return all_prompts, tot_input_len
+def test_batch_by_batch(all_prompts, gen_len):
+    backend.flush_cache()
+    tot_time = 0
+    for i in range(len(all_prompts)):
+        tic = time.perf_counter()
+        text_qa.run_batch(
+            list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))),
+        )
+        tot_time += time.perf_counter() - tic
+    return tot_time
+def test_batch_by_batch_with_hint(all_prompts, gen_len):
+    backend.flush_cache()
+    tot_time = 0
+    for i in range(len(all_prompts)):
+        tic = time.perf_counter()
+        # Send a hint to cache the prefix
+        text_qa.run_batch(list(zip(all_prompts[i][:1], [gen_len])))
+        # Send the batch
+        text_qa.run_batch(list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))))
+        tot_time += time.perf_counter() - tic
+    return tot_time
+def test_send_all(all_prompts, gen_len):
+    backend.flush_cache()
+    all_prompts = [x for prompt_list in all_prompts for x in prompt_list]
+    tic = time.perf_counter()
+    text_qa.run_batch(
+        list(zip(all_prompts, [gen_len] * len(all_prompts))),
+    )
+    tot_time = time.perf_counter() - tic
+    return tot_time
+if __name__ == "__main__":
+    # `tokenizer` and `backend` are assigned at module level here because the
+    # helper functions above read them as globals.
+    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
+    # Server address; matches the port used in the launch command at the top of the file.
+    backend = RuntimeEndpoint("http://127.0.0.1:30000")
+    set_default_backend(backend)
+    # Fixed seed so the random prompts are the same on every run.
+    random.seed(0)
+    # Workload shape: 10 prefix groups x 32 prompts, each with a 1024-token
+    # prefix and a 128-token suffix, generating a single token per prompt.
+    num_prefix = 10
+    num_samples_per_prefix = 32
+    prefix_length = 1024
+    suffix_length = 128
+    gen_len = 1
+    all_prompts, tot_input_len = prepare_prompts(
+        num_prefix, num_samples_per_prefix, prefix_length, suffix_length
+    )
+    print(f"Total input token length: {tot_input_len}\n")
+    # Each strategy flushes the server cache itself before sending requests.
+    cost = test_batch_by_batch(all_prompts, gen_len)
+    print(f"Latency of test_batch_by_batch          : {cost:.4f} s\n")
+    cost = test_batch_by_batch_with_hint(all_prompts, gen_len)
+    print(f"Latency of test_batch_by_batch_with_hint: {cost:.4f} s\n")
+    cost = test_send_all(all_prompts, gen_len)
+    print(f"Latency of test_send_all                : {cost:.4f} s\n")

sglang/benchmark/bench_rope/benchmark_rope_index.py ADDED Viewed

	@@ -0,0 +1,425 @@

+# This script benchmarks MRotaryEmbedding.get_rope_index_glm4v (GLM4V mrope index builder).
+# It generates synthetic multimodal input_ids + attention_mask (+ optional image/video grids)
+# and times the call with the grids (multimodal path) and without them (fallback path).
+#
+# == Usage Examples ==
+#
+# python3 benchmark_rope_index.py --device cuda --num-tokens 1024 2048 --benchmark-iter 200
+import argparse
+import math
+import time
+from dataclasses import dataclass, field
+from typing import Any
+import numpy as np
+import torch
+from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
+# -----------------------------
+# Minimal config objects
+# -----------------------------
+@dataclass
+class DummyVisionConfig:
+    """Minimal stand-in for the vision part of a model config."""
+    # Factor applied to each side of the approximated patch grid when building
+    # the image/video grid sizes for the synthetic data.
+    spatial_merge_size: int = 2
+@dataclass
+class DummyHFConfig:
+    """Minimal stand-in for the model config passed to `get_rope_index_glm4v`."""
+    # Token id repeated for every image patch (also used for video patches).
+    image_token_id: int = 32000
+    # Token ids that open and close a video segment.
+    video_start_token_id: int = 32001
+    video_end_token_id: int = 32002
+    # default_factory gives every instance its own nested config object.
+    vision_config: DummyVisionConfig = field(
+        default_factory=lambda: DummyVisionConfig(spatial_merge_size=2)
+    )
+# -----------------------------
+# Helpers
+# -----------------------------
+def calculate_stats(times: list[float]) -> dict[str, float]:
+    """Calculate statistics from a list of times."""
+    times_array = np.array(times, dtype=np.float64)
+    return {
+        "mean": float(np.mean(times_array)),
+        "median": float(np.median(times_array)),
+        "p99": float(np.percentile(times_array, 99)),
+        "min": float(np.min(times_array)),
+        "max": float(np.max(times_array)),
+    }
+def _sync(device: torch.device):
+    if device.type == "cuda":
+        torch.cuda.synchronize()
+def _approx_hw(patches: int, merge: int) -> tuple[int, int]:
+    # want (h/merge)*(w/merge) ~= patches
+    gh = int(math.sqrt(max(1, patches)))
+    gw = max(1, patches // max(1, gh))
+    return gh * merge, gw * merge
+def generate_test_data(
+    num_tokens: int,
+    batch_size: int,
+    hf_config: DummyHFConfig,
+    dtype: torch.dtype,
+    device: torch.device,
+    pad_ratio: float,
+    num_images_per_sample: int,
+    image_patch_tokens: int,
+    num_videos_per_sample: int,
+    video_patch_tokens: int,
+    seed: int,
+):
+    """
+    Generate synthetic (input_ids, attention_mask, image_grid_thw, video_grid_thw).
+    NOTE:
+      - image_grid_thw / video_grid_thw are global lists across the entire batch in encounter order,
+        matching the function's image_index/video_index behavior.
+      - image patches are represented by repeated image_token_id.
+      - video patches are represented by image_token_id wrapped with start/end tokens.
+    """
+    torch.manual_seed(seed)
+    forbidden = {
+        0,
+        hf_config.image_token_id,
+        hf_config.video_start_token_id,
+        hf_config.video_end_token_id,
+    }
+    vocab_size = 50000
+    def rand_text(n: int) -> torch.Tensor:
+        # generate random ids not in forbidden
+        out = torch.randint(1, vocab_size, (n,), device=device, dtype=torch.long)
+        # fix forbidden by +1 until ok (cheap, deterministic enough for benchmark data)
+        for bad in forbidden:
+            out = torch.where(out == bad, out + 1, out)
+        return out
+    image_grids: list[list[int]] = []
+    video_grids: list[list[int]] = []
+    input_ids = torch.zeros((batch_size, num_tokens), device=device, dtype=torch.long)
+    attention_mask = torch.zeros(
+        (batch_size, num_tokens), device=device, dtype=torch.long
+    )
+    eff_len = int(round(num_tokens * (1.0 - pad_ratio)))
+    eff_len = max(1, min(num_tokens, eff_len))
+    min_needed = 1
+    min_needed += num_images_per_sample * image_patch_tokens
+    min_needed += num_videos_per_sample * (2 + video_patch_tokens)
+    if eff_len < min_needed:
+        num_images_per_sample = 0
+        num_videos_per_sample = 0
+    for b in range(batch_size):
+        blocks: list[torch.Tensor] = []
+        reserved = (
+            num_images_per_sample * image_patch_tokens
+            + num_videos_per_sample * (2 + video_patch_tokens)
+        )
+        reserved = min(reserved, max(0, eff_len - 1))
+        text_budget = max(1, eff_len - reserved)
+        n_text_chunks = num_images_per_sample + num_videos_per_sample + 1
+        base = text_budget // n_text_chunks
+        rem = text_budget % n_text_chunks
+        text_chunks = [base + (1 if i < rem else 0) for i in range(n_text_chunks)]
+        tci = 0
+        for _ in range(num_images_per_sample):
+            blocks.append(rand_text(text_chunks[tci]))
+            tci += 1
+            blocks.append(
+                torch.full(
+                    (image_patch_tokens,),
+                    hf_config.image_token_id,
+                    device=device,
+                    dtype=torch.long,
+                )
+            )
+            h, w = _approx_hw(
+                image_patch_tokens, hf_config.vision_config.spatial_merge_size
+            )
+            image_grids.append([1, h, w])
+        for _ in range(num_videos_per_sample):
+            blocks.append(rand_text(text_chunks[tci]))
+            tci += 1
+            blocks.append(
+                torch.tensor(
+                    [hf_config.video_start_token_id], device=device, dtype=torch.long
+                )
+            )
+            blocks.append(
+                torch.full(
+                    (video_patch_tokens,),
+                    hf_config.image_token_id,
+                    device=device,
+                    dtype=torch.long,
+                )
+            )
+            blocks.append(
+                torch.tensor(
+                    [hf_config.video_end_token_id], device=device, dtype=torch.long
+                )
+            )
+            h, w = _approx_hw(
+                video_patch_tokens, hf_config.vision_config.spatial_merge_size
+            )
+            # first field = group count used by code; set to 1
+            video_grids.append([1, h, w])
+        blocks.append(rand_text(text_chunks[tci]))
+        tokens = torch.cat(blocks, dim=0)[:eff_len]
+        pad = torch.zeros(
+            (num_tokens - tokens.numel(),), device=device, dtype=torch.long
+        )
+        ids = torch.cat([tokens, pad], dim=0)
+        mask = torch.cat(
+            [
+                torch.ones((tokens.numel(),), device=device, dtype=torch.long),
+                torch.zeros(
+                    (num_tokens - tokens.numel(),), device=device, dtype=torch.long
+                ),
+            ],
+            dim=0,
+        )
+        input_ids[b] = ids
+        attention_mask[b] = mask
+    image_grid_thw = (
+        torch.tensor(image_grids, device=device, dtype=torch.long)
+        if len(image_grids)
+        else None
+    )
+    video_grid_thw = (
+        torch.tensor(video_grids, device=device, dtype=torch.long)
+        if len(video_grids)
+        else None
+    )
+    return (
+        input_ids.to(dtype=torch.long),
+        attention_mask.to(dtype=torch.long),
+        image_grid_thw,
+        video_grid_thw,
+    )
+def benchmark_rope_index(
+    model_name: str,
+    tp_size: int,
+    num_tokens: int,
+    batch_size: int,
+    pad_ratio: float,
+    spatial_merge_size: int,
+    num_images: int,
+    image_patch_tokens: int,
+    num_videos: int,
+    video_patch_tokens: int,
+    dtype: torch.dtype,
+    seed: int,
+    warmup_iter: int,
+    benchmark_iter: int,
+    device: torch.device,
+):
+    torch.manual_seed(seed)
+    hf_config = DummyHFConfig(
+        image_token_id=32000,
+        video_start_token_id=32001,
+        video_end_token_id=32002,
+        vision_config=DummyVisionConfig(spatial_merge_size=spatial_merge_size),
+    )
+    print(80 * "=")
+    print(
+        f"Evaluating: {model_name} tp_size={tp_size} "
+        f"num_tokens={num_tokens} batch={batch_size} pad_ratio={pad_ratio} "
+        f"images/sample={num_images} image_patch_tokens={image_patch_tokens} "
+        f"videos/sample={num_videos} video_patch_tokens={video_patch_tokens} "
+        f"dtype={dtype} device={device}"
+    )
+    input_ids, attention_mask, image_grid_thw, video_grid_thw = generate_test_data(
+        num_tokens=num_tokens,
+        batch_size=batch_size,
+        hf_config=hf_config,
+        dtype=dtype,
+        device=device,
+        pad_ratio=pad_ratio,
+        num_images_per_sample=num_images,
+        image_patch_tokens=image_patch_tokens,
+        num_videos_per_sample=num_videos,
+        video_patch_tokens=video_patch_tokens,
+        seed=seed,
+    )
+    # Smoke test
+    has_mm = (image_grid_thw is not None) or (video_grid_thw is not None)
+    if has_mm:
+        pos, delta = MRotaryEmbedding.get_rope_index_glm4v(
+            input_ids=input_ids,
+            hf_config=hf_config,
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+            attention_mask=attention_mask,
+        )
+        assert pos.shape == (3, batch_size, num_tokens)
+        assert delta.shape == (batch_size, 1)
+    # Warm up
+    for _ in range(warmup_iter):
+        if has_mm:
+            MRotaryEmbedding.get_rope_index_glm4v(
+                input_ids=input_ids,
+                hf_config=hf_config,
+                image_grid_thw=image_grid_thw,
+                video_grid_thw=video_grid_thw,
+                attention_mask=attention_mask,
+            )
+        MRotaryEmbedding.get_rope_index_glm4v(
+            input_ids=input_ids,
+            hf_config=hf_config,
+            image_grid_thw=None,
+            video_grid_thw=None,
+            attention_mask=attention_mask,
+        )
+    _sync(device)
+    # Time multimodal branch
+    multimodal_times = []
+    for _ in range(benchmark_iter):
+        _sync(device)
+        start = time.time()
+        MRotaryEmbedding.get_rope_index_glm4v(
+            input_ids=input_ids,
+            hf_config=hf_config,
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+            attention_mask=attention_mask,
+        )
+        _sync(device)
+        multimodal_times.append(time.time() - start)
+    # Time fallback branch
+    fallback_times = []
+    for _ in range(benchmark_iter):
+        _sync(device)
+        start = time.time()
+        MRotaryEmbedding.get_rope_index_glm4v(
+            input_ids=input_ids,
+            hf_config=hf_config,
+            image_grid_thw=None,
+            video_grid_thw=None,
+            attention_mask=attention_mask,
+        )
+        _sync(device)
+        fallback_times.append(time.time() - start)
+    multimodal_stats = calculate_stats(multimodal_times)
+    fallback_stats = calculate_stats(fallback_times)
+    print(f"\nPerformance for config (B={batch_size}, T={num_tokens}):")
+    print(
+        f"Multimodal: mean={multimodal_stats['mean']:.8f}s, "
+        f"median={multimodal_stats['median']:.8f}s, "
+        f"p99={multimodal_stats['p99']:.8f}s"
+    )
+    print(
+        f"Fallback:   mean={fallback_stats['mean']:.8f}s, "
+        f"median={fallback_stats['median']:.8f}s, "
+        f"p99={fallback_stats['p99']:.8f}s"
+    )
+    if has_mm:
+        speedup = (
+            multimodal_stats["mean"] / fallback_stats["mean"]
+            if fallback_stats["mean"] > 0
+            else float("inf")
+        )
+        print(f"Fallback Speedup over Multimodal: {speedup:.8f}x")
+    else:
+        speedup = float("nan")
+        print(
+            "[INFO] num_tokens too small for multimodal segments; skip multimodal benchmark."
+        )
+    print(f"Fallback Speedup over Multimodal: {speedup:.8f}x")
+    return multimodal_stats, fallback_stats, speedup
+if __name__ == "__main__":
+    # Command-line entry point: runs benchmark_rope_index once per token length.
+    parser = argparse.ArgumentParser(
+        description="Benchmark GLM4V get_rope_index_glm4v."
+    )
+    # model-name and tp-size are forwarded to the benchmark for its printed header.
+    parser.add_argument("--model-name", type=str, default="GLM4V")
+    parser.add_argument("--tp-size", type=int, default=1)
+    parser.add_argument(
+        "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu"
+    )
+    parser.add_argument("--warmup-iter", type=int, default=10)
+    parser.add_argument("--benchmark-iter", type=int, default=100)
+    # Resolved to a torch dtype via getattr(torch, ...) below; int64 is the only choice.
+    parser.add_argument("--dtype", type=str, choices=["int64"], default="int64")
+    parser.add_argument("--seed", type=int, default=0)
+    # token length sweep
+    parser.add_argument("--num-tokens", type=int, nargs="+", required=False)
+    # data shape knobs
+    parser.add_argument("--batch-size", type=int, default=1)
+    parser.add_argument("--pad-ratio", type=float, default=0.0)
+    parser.add_argument("--spatial-merge-size", type=int, default=2)
+    parser.add_argument("--num-images", type=int, default=1)
+    parser.add_argument("--image-patch-tokens", type=int, default=256)
+    parser.add_argument("--num-videos", type=int, default=1)
+    parser.add_argument("--video-patch-tokens", type=int, default=256)
+    # output
+    # NOTE(review): --out-dir is parsed but not read anywhere in the visible code.
+    parser.add_argument("--out-dir", type=str, default=".")
+    args = parser.parse_args()
+    print(args)
+    device = torch.device(args.device)
+    # Without --num-tokens, sweep powers of two from 1 up to 2**17.
+    if args.num_tokens is None:
+        num_tokens_list = [2**i for i in range(0, 18)]
+    else:
+        num_tokens_list = args.num_tokens
+    # NOTE(review): `rows` is never appended to in the visible code and the loop
+    # discards the returned stats; possibly leftover from result export - confirm.
+    rows: list[dict[str, Any]] = []
+    for num_tokens in num_tokens_list:
+        multimodal_stats, fallback_stats, speedup = benchmark_rope_index(
+            model_name=args.model_name,
+            tp_size=args.tp_size,
+            num_tokens=num_tokens,
+            batch_size=args.batch_size,
+            pad_ratio=args.pad_ratio,
+            spatial_merge_size=args.spatial_merge_size,
+            num_images=args.num_images,
+            image_patch_tokens=args.image_patch_tokens,
+            num_videos=args.num_videos,
+            video_patch_tokens=args.video_patch_tokens,
+            dtype=getattr(torch, args.dtype),
+            seed=args.seed,
+            warmup_iter=args.warmup_iter,
+            benchmark_iter=args.benchmark_iter,
+            device=device,
+        )

sglang/benchmark/benchmark_batch/benchmark_batch.py ADDED Viewed

	@@ -0,0 +1,193 @@

+import concurrent.futures
+import os
+import random
+import time
+from concurrent.futures import ProcessPoolExecutor
+from statistics import mean
+import requests
+from tqdm import tqdm
+from transformers import AutoTokenizer
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+###############################################################################
+# CONFIG
+###############################################################################
+# Base URL of the server; "/generate" is appended to it when sending requests.
+ENDPOINT_URL = "http://127.0.0.1:30000"
+# Path or name handed to AutoTokenizer.from_pretrained when generating prompts.
+TOKENIZER_DIR = "/models/meta-llama/Llama-3.2-3B"
+# Benchmark configurations
+NUM_REQUESTS = 10  # Total number of requests (each with BATCH_SIZE prompts)
+# Random token ids sampled per prompt before decoding to text; the count after
+# re-tokenization may differ.
+NUM_TOKENS = 32000  # Tokens per prompt
+BATCH_SIZE = 8  # Number of prompts per request
+GEN_TOKENS = 0  # Tokens to generate per prompt
+###############################################################################
+# REQUEST GENERATION (in parallel)
+###############################################################################
+def generate_random_prompt(index, tokenizer_dir, num_tokens):
+    """Generate a single random prompt with specified token count."""
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
+    vocab_size = tokenizer.vocab_size
+    def generate_random_text(num_toks):
+        random_token_ids = [random.randint(0, vocab_size - 1) for _ in range(num_toks)]
+        return tokenizer.decode(random_token_ids, clean_up_tokenization_spaces=True)
+    random_text = generate_random_text(num_tokens)
+    return f"Prompt {index}: {random_text}"
+def prepare_all_prompts(num_requests, batch_size, num_tokens, tokenizer_dir):
+    """Generate prompts for all requests in parallel."""
+    total_prompts = num_requests * batch_size
+    all_prompts = [None] * total_prompts
+    max_workers = min(os.cpu_count() or 1, total_prompts)
+    with ProcessPoolExecutor(max_workers=max_workers) as executor:
+        futures = [
+            executor.submit(generate_random_prompt, i, tokenizer_dir, num_tokens)
+            for i in range(total_prompts)
+        ]
+        for future in tqdm(
+            concurrent.futures.as_completed(futures),
+            total=total_prompts,
+            desc="Generating prompts",
+        ):
+            index = futures.index(future)
+            all_prompts[index] = future.result()
+    batched_prompts = [
+        all_prompts[i * batch_size : (i + 1) * batch_size] for i in range(num_requests)
+    ]
+    print(
+        f"Generated {total_prompts} prompts with {num_tokens} tokens each, grouped into {num_requests} requests of {batch_size} prompts.\n"
+    )
+    return batched_prompts
+###############################################################################
+# HTTP CALLS
+###############################################################################
+def send_batch_request(endpoint, prompts, gen_tokens, request_id):
+    """Send a batch of prompts to the /generate endpoint synchronously."""
+    sampling_params = {
+        "max_new_tokens": gen_tokens,
+        "temperature": 0.7,
+        "stop": "\n",
+    }
+    data = {"text": prompts, "sampling_params": sampling_params}
+    start_time = time.perf_counter()
+    try:
+        response = requests.post(
+            endpoint.base_url + "/generate", json=data, timeout=3600
+        )
+        if response.status_code != 200:
+            error = response.json()
+            raise RuntimeError(f"Request {request_id} failed: {error}")
+        result = response.json()
+        elapsed_time = (time.perf_counter() - start_time) * 1000  # Convert to ms
+        avg_per_prompt = elapsed_time / len(prompts) if prompts else 0
+        return request_id, elapsed_time, avg_per_prompt, True, len(prompts)
+    except Exception as e:
+        print(f"[Request] Error for request {request_id}: {e}")
+        return request_id, 0, 0, False, len(prompts)
+def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
+    """Run the benchmark sequentially."""
+    results = []
+    num_requests = len(batched_prompts)
+    # Record start time for total latency
+    benchmark_start_time = time.perf_counter()
+    for i, batch_prompts in enumerate(batched_prompts):
+        request_id = i + 1
+        assert (
+            len(batch_prompts) == batch_size
+        ), f"Request {request_id} should have {batch_size} prompts, got {len(batch_prompts)}"
+        print(
+            f"[Request] Sending request {request_id}/{num_requests} with {len(batch_prompts)} prompts at {int(time.time()*1000)}"
+        )
+        result = send_batch_request(endpoint, batch_prompts, gen_tokens, request_id)
+        results.append(result)
+    # Calculate total latency
+    total_latency = (time.perf_counter() - benchmark_start_time) * 1000  # Convert to ms
+    return results, total_latency
+###############################################################################
+# RESULTS
+###############################################################################
+def process_results(results, total_latency, num_requests):
+    """Process and display benchmark results."""
+    total_time = 0
+    successful_requests = 0
+    failed_requests = 0
+    request_latencies = []
+    per_prompt_latencies = []
+    total_prompts = 0
+    for request_id, elapsed_time, avg_per_prompt, success, batch_size in results:
+        if success:
+            successful_requests += 1
+            total_prompts += batch_size
+            request_latencies.append(elapsed_time)
+            per_prompt_latencies.append(avg_per_prompt)
+            total_time += elapsed_time / 1000  # Convert to seconds
+        else:
+            failed_requests += 1
+    avg_request_latency = mean(request_latencies) if request_latencies else 0
+    avg_per_prompt_latency = mean(per_prompt_latencies) if per_prompt_latencies else 0
+    throughput = total_prompts / total_time if total_time > 0 else 0
+    print("\nBenchmark Summary:")
+    print(f"  Total requests sent:         {len(results)}")
+    print(f"  Total prompts sent:          {total_prompts}")
+    print(f"  Successful requests:         {successful_requests}")
+    print(f"  Failed requests:             {failed_requests}")
+    print(f"  Total latency (all requests): {total_latency:.2f} ms")
+    print(f"  Avg per request latency:     {avg_request_latency:.2f} ms")
+    print(f"  Avg per prompt latency:      {avg_per_prompt_latency:.2f} ms")
+    print(f"  Throughput:                  {throughput:.2f} prompts/second\n")
+###############################################################################
+# MAIN
+###############################################################################
+def main():
+    # Initialize endpoint
+    endpoint = RuntimeEndpoint(ENDPOINT_URL)
+    # Generate prompts
+    batched_prompts = prepare_all_prompts(
+        NUM_REQUESTS, BATCH_SIZE, NUM_TOKENS, TOKENIZER_DIR
+    )
+    # Flush cache before benchmark
+    # endpoint.flush_cache()
+    # Run benchmark
+    print(
+        f"Starting benchmark: NUM_TOKENS={NUM_TOKENS}, BATCH_SIZE={BATCH_SIZE}, NUM_REQUESTS={NUM_REQUESTS}\n"
+    )
+    results, total_latency = run_benchmark(
+        endpoint, batched_prompts, BATCH_SIZE, GEN_TOKENS
+    )
+    # Process and display results
+    process_results(results, total_latency, NUM_REQUESTS)
+if __name__ == "__main__":
+    # Seed this process's module-level RNG before running the benchmark.
+    random.seed(0)
+    main()

sglang/benchmark/benchmark_batch/benchmark_tokenizer.py ADDED Viewed

	@@ -0,0 +1,237 @@

+import argparse
+import random
+import time
+from statistics import mean
+from transformers import AutoTokenizer
+from sglang.srt.utils.patch_tokenizer import patch_tokenizer
+def main():
+    """Benchmark sequential vs. batched tokenizer calls from the command line.
+
+    Parses the CLI arguments, prints the chosen configuration, loads and
+    patches the tokenizer, generates random token-id sequences, and then runs
+    the "encode" and/or "decode" benchmark selected with ``--function``.
+    """
+    args = parse_args()
+    print("Tokenizer Benchmark: Sequential vs Batch Processing")
+    print("-" * 60)
+    print(f"Tokenizer: {args.tokenizer}")
+    print(f"Functions: {', '.join(args.function)}")
+    print(f"Tokens per prompt: {args.num_tokens}")
+    print(f"Number of runs per batch size: {args.num_runs}")
+    print(f"Batch mode: {', '.join(args.batch_mode)}")
+    print("-" * 60)
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=True)
+    tokenizer = patch_tokenizer(tokenizer)
+    # Generate enough sequences for the largest requested batch size; smaller
+    # batch sizes are served by slicing a prefix of this data in `benchmark`.
+    max_batch_size = max(args.batch_sizes)
+    token_ids = generate_random_token_ids(
+        num_prompts=max_batch_size, num_tokens=args.num_tokens, tokenizer=tokenizer
+    )
+    if "encode" in args.function:
+        # Encoding needs text input, so turn the random ids back into strings.
+        prompts = [
+            tokenizer.decode(ids, clean_up_tokenization_spaces=True)
+            for ids in token_ids
+        ]
+        run_benchmark(
+            name="encode",
+            data=prompts,
+            # Sequential path: one encode() call per prompt.
+            sequential_fn=lambda batch: [tokenizer.encode(p) for p in batch],
+            # Batch path: a single tokenizer call on the whole list of prompts.
+            batch_fn=lambda batch: tokenizer(batch),
+            batch_sizes=args.batch_sizes,
+            num_runs=args.num_runs,
+            batch_mode=args.batch_mode,
+        )
+    if "decode" in args.function:
+        # mimic DetokenizerManager's usual case
+        decode_kwargs = dict(
+            skip_special_tokens=True,
+            spaces_between_special_tokens=True,
+        )
+        run_benchmark(
+            name="decode",
+            data=token_ids,
+            # Sequential path: one decode() call per id sequence.
+            sequential_fn=lambda batch: [
+                tokenizer.decode(ids, **decode_kwargs) for ids in batch
+            ],
+            # Batch path: a single batch_decode() call on all sequences.
+            batch_fn=lambda batch: tokenizer.batch_decode(batch, **decode_kwargs),
+            batch_sizes=args.batch_sizes,
+            num_runs=args.num_runs,
+            batch_mode=args.batch_mode,
+        )
+def run_benchmark(
+    *, name, data, sequential_fn, batch_fn, batch_sizes, num_runs, batch_mode
+):
+    """Run one named benchmark over every batch size and print the results.
+
+    Args:
+        name: Label of the benchmarked function; printed in the banner and
+            passed to `print_results`.
+        data: Sequence of inputs; each batch size uses a prefix of it.
+        sequential_fn: Callable taking a batch and processing items one by one.
+        batch_fn: Callable taking a batch and processing it in a single call.
+        batch_sizes: Iterable of batch sizes to measure.
+        num_runs: Number of timed repetitions per batch size.
+        batch_mode: Collection containing "single" and/or "batch", selecting
+            which of the two callables are timed.
+    """
+    print("\n" + "=" * 60)
+    print(f"{name.upper()} BENCHMARK")
+    print("=" * 60)
+    # One result dict per batch size, in the order the sizes were given.
+    results = [
+        benchmark(
+            data=data,
+            batch_size=bs,
+            sequential_fn=sequential_fn,
+            batch_fn=batch_fn,
+            num_runs=num_runs,
+            batch_mode=batch_mode,
+        )
+        for bs in batch_sizes
+    ]
+    print_results(results=results, func_name=name, batch_mode=batch_mode)
+def benchmark(*, data, batch_size, sequential_fn, batch_fn, num_runs, batch_mode):
+    """Time the sequential and/or batched callable on the first `batch_size` items.
+
+    Args:
+        data: Sequence of inputs; only ``data[:batch_size]`` is used.
+        batch_size: Number of leading items of `data` to process.
+        sequential_fn: Callable timed when "single" is in `batch_mode`.
+        batch_fn: Callable timed when "batch" is in `batch_mode`.
+        num_runs: Number of timed repetitions for each callable.
+        batch_mode: Collection containing "single" and/or "batch".
+
+    Returns:
+        A dict with "batch_size" plus, depending on `batch_mode`,
+        "avg_sequential_ms"/"sequential_runs", "avg_batch_ms"/"batch_runs",
+        and (only when both modes ran) "speedup_factor".
+    """
+    batch_data = data[:batch_size]
+    run_single = "single" in batch_mode
+    run_batch = "batch" in batch_mode
+    out = {"batch_size": batch_size}
+    if run_single:
+        sequential_times = measure_times(
+            fn=lambda: sequential_fn(batch_data), num_runs=num_runs
+        )
+        # Merge the sequential timings into the result dict in place.
+        out |= {
+            "avg_sequential_ms": mean(sequential_times),
+            "sequential_runs": sequential_times,
+        }
+    if run_batch:
+        batch_times = measure_times(fn=lambda: batch_fn(batch_data), num_runs=num_runs)
+        out |= {
+            "avg_batch_ms": mean(batch_times),
+            "batch_runs": batch_times,
+        }
+    if run_single and run_batch:
+        # Speedup = sequential time / batch time; reported as 0 when the batch
+        # average is not positive, to avoid dividing by zero.
+        out["speedup_factor"] = (
+            out["avg_sequential_ms"] / out["avg_batch_ms"]
+            if out["avg_batch_ms"] > 0
+            else 0
+        )
+    return out
+def print_results(*, results, func_name, batch_mode):
+    """Print per-run details for every batch size, followed by a summary table.
+
+    Args:
+        results: List of result dicts as returned by `benchmark`.
+        func_name: Name of the benchmarked function, used in labels and headers.
+        batch_mode: Collection containing "single" and/or "batch"; decides
+            which timings (and whether the speedup) are printed.
+    """
+    run_single = "single" in batch_mode
+    run_batch = "batch" in batch_mode
+    # Detailed section: every individual run for each batch size.
+    for r in results:
+        print(f"\nBatch size: {r['batch_size']}")
+        if run_single:
+            print_runs(
+                label=f"Sequential {func_name}",
+                runs=r["sequential_runs"],
+                avg=r["avg_sequential_ms"],
+            )
+        if run_batch:
+            print_runs(
+                label=f"Batch {func_name}", runs=r["batch_runs"], avg=r["avg_batch_ms"]
+            )
+        if run_single and run_batch:
+            print(f"  Speedup factor: {r['speedup_factor']:.2f}x")
+    print("\n" + "=" * 60)
+    print(f"SUMMARY: {func_name.upper()}")
+    print("=" * 60)
+    # Summary table: only the columns for the modes that were run.
+    headers = ["Batch Size"]
+    if run_single:
+        headers.append("Sequential (ms)")
+    if run_batch:
+        headers.append("Batch (ms)")
+    if run_single and run_batch:
+        headers.append("Speedup")
+    # Each column is left-aligned in a fixed width of 18 characters.
+    print("".join(f"{h:<18}" for h in headers))
+    print("-" * (18 * len(headers)))
+    for r in results:
+        row = [f"{r['batch_size']}"]
+        if run_single:
+            row.append(f"{r['avg_sequential_ms']:.2f} ms")
+        if run_batch:
+            row.append(f"{r['avg_batch_ms']:.2f} ms")
+        if run_single and run_batch:
+            row.append(f"{r['speedup_factor']:.2f}x")
+        print("".join(f"{v:<18}" for v in row))
+def print_runs(*, label, runs, avg):
+    """Print a labelled list of run times (1-based run numbers) and their average.
+
+    Args:
+        label: Heading printed above the runs.
+        runs: Iterable of individual timings, printed with an "ms" suffix.
+        avg: Average timing, printed last with an "ms" suffix.
+    """
+    print(f"  {label}:")
+    for i, t in enumerate(runs):
+        print(f"    Run {i+1}: {t:.2f} ms")
+    print(f"    Average: {avg:.2f} ms")
+def measure_times(*, fn, num_runs):
+    """Call `fn` `num_runs` times and return each call's wall time in milliseconds.
+
+    Args:
+        fn: Zero-argument callable to time; its return value is discarded.
+        num_runs: Number of times to call `fn`.
+
+    Returns:
+        A list of `num_runs` floats, one elapsed time per call, in call order.
+    """
+    times = []
+    for _ in range(num_runs):
+        start = time.perf_counter()
+        fn()
+        # perf_counter() returns seconds; scale to milliseconds.
+        times.append((time.perf_counter() - start) * 1000)
+    return times
+def generate_random_token_ids(*, num_prompts, num_tokens, tokenizer):
+    """Build `num_prompts` lists of `num_tokens` random token ids each.
+
+    Ids are drawn with the `random` module from ``[0, tokenizer.vocab_size - 1]``.
+
+    Args:
+        num_prompts: Number of sequences to generate.
+        num_tokens: Length of each sequence.
+        tokenizer: Object exposing a ``vocab_size`` attribute.
+
+    Returns:
+        A list of `num_prompts` lists of ints.
+    """
+    vocab_size = tokenizer.vocab_size
+    print(f"Generating {num_prompts} random sequences with {num_tokens} tokens each...")
+    return [
+        # randint() includes both endpoints, hence the `vocab_size - 1` bound.
+        [random.randint(0, vocab_size - 1) for _ in range(num_tokens)]
+        for _ in range(num_prompts)
+    ]
+def parse_args():
+    """Define and parse the benchmark's command-line options.
+
+    Options: ``--tokenizer`` (required), ``--function`` (encode/decode),
+    ``--num-tokens``, ``--batch-sizes``, ``--batch-mode`` (single/batch) and
+    ``--num-runs``.
+
+    Returns:
+        The namespace produced by ``parser.parse_args()``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Tokenizer Benchmark: Sequential vs Batch Processing"
+    )
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        required=True,
+        help="Tokenizer name or path (e.g. nvidia/Kimi-K2-Thinking-NVFP4)",
+    )
+    # nargs="+" lets the user pass one or both function names.
+    parser.add_argument(
+        "--function",
+        type=str,
+        nargs="+",
+        choices=["encode", "decode"],
+        default=["encode", "decode"],
+        help="Functions to benchmark (default: encode decode)",
+    )
+    parser.add_argument(
+        "--num-tokens",
+        type=int,
+        default=20000,
+        help="Number of tokens per prompt (default: 20000)",
+    )
+    parser.add_argument(
+        "--batch-sizes",
+        type=int,
+        nargs="+",
+        default=[1, 2, 4, 8],
+        help="Batch sizes to test (default: 1 2 4 8)",
+    )
+    # "single" times the sequential callable, "batch" times the batched one.
+    parser.add_argument(
+        "--batch-mode",
+        nargs="+",
+        choices=["single", "batch"],
+        default=["single", "batch"],
+        help="Benchmark modes to run (default: single batch)",
+    )
+    parser.add_argument(
+        "--num-runs",
+        type=int,
+        default=5,
+        help="Number of runs per batch size (default: 5)",
+    )
+    return parser.parse_args()
+# Script entry point: seed `random` with a fixed value so the generated
+# token ids are the same on every run.
+if __name__ == "__main__":
+    random.seed(0)
+    main()

sglang/benchmark/benchmark_vllm_060/README.md ADDED Viewed

	@@ -0,0 +1,89 @@

+## How to reproduce the benchmark results for SGLang v0.3.0 compared to vLLM v0.6.0
+In short, in the online scenarios we benchmarked with vLLM's multi-step scheduling enabled, the Median TTFT of vLLM is **3 times** that of SGLang, and the Median ITL is **10 times** that of SGLang (lower Median TTFT and ITL are better). In other words, vLLM's multi-step optimization did not improve throughput while also keeping Median TTFT and ITL low. Also, in the maximum-throughput benchmark, when vLLM uses its default configuration rather than explicitly setting GPU memory utilization to 0.95, its maximum throughput is **lower** than that of SGLang.
+## Online benchmark results
+### Llama 3.1 8B Instruct 1 x A100 80G
+| RPS  | Num prompts | Engine | Median E2E Latency | Median TTFT | Median TPOT | Median ITL |
+|------|-------------|--------|--------------------|-------------|-------------|------------|
+| 4    | 1200        | SGLang | 1564.17            | **31.98**   | 13.17       | **11.93**  |
+| 4    | 1200        | vLLM   | 1691.97            | **100.48**  | 14.14       | **129.32** |
+| 8    | 2400        | SGLang | 2175.02            | **35.68**   | 17.85       | **14.41**  |
+| 8    | 2400        | vLLM   | 2137.16            | **120.39**  | 17.09       | **158.63** |
+### Llama 3.1 70B Instruct 4 x H100 80G
+| RPS  | Num Prompts | Engine | Median E2E Latency | Median TTFT | Median TPOT | Median ITL |
+|------|-------------|--------|--------------------|-------------|-------------|------------|
+| 4    | 1200        | SGLang | 3005.24            | **53.94**   | 25.03       | **21.67**  |
+| 4    | 1200        | vLLM   | 2915.60            | **179.15**  | 23.58       | **231.23** |
+| 8    | 2400        | SGLang | 4064.98            | **58.11**   | 33.07       | **24.45**  |
+| 8    | 2400        | vLLM   | 3752.38            | **207.12**  | 29.15       | **275.32** |
+## Offline benchmark results
+### Llama 3.1 8B Instruct 1 x A100 80G
+| RPS  | Num Prompts | Engine | Request throughput | Output token throughput |
+|------|-------------|--------|--------------------|-------------------------|
+| inf  | 5000        | SGLang | 22.03              | **4281.51**             |
+| inf  | 5000        | vLLM   | 21.27              | **4132.37**             |
+### Llama 3.1 70B Instruct 4 x H100 80G
+| RPS  | Num Prompts | Engine | Request throughput | Output token throughput |
+|------|-------------|--------|--------------------|-------------------------|
+| inf  | 5000        | SGLang | 19.84              | **3856.01**             |
+| inf  | 5000        | vLLM   | 19.04              | **3700.64**             |
+## Installation
+```bash
+# install sglang v0.3.0
+pip install --upgrade pip
+pip install "sglang[all]"==0.3.0
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
+# install vllm v0.6.0
+pip install vllm==0.6.0
+```
+## Notes
+We followed the reproduction method in https://github.com/vllm-project/vllm/issues/8176, and added the `--num-scheduler-steps 10` parameter when starting the vLLM server. The `gpu_memory_utilization` of vLLM is by default 0.9 at both TP 1 and TP 4, while SGLang's `mem_frac` is 0.88 at TP 1 and 0.85 at TP 4, so we manually set SGLang's `mem_frac` to 0.88 at TP 4.
+## Online benchmarks
+```bash
+# Llama 3.1 8B Instruct on 1 x A100
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096
+# Llama 3.1 70B Instruct on 4 x H100
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 4
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096
+# bench serving
+python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 1200 --request-rate 4
+python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 2400 --request-rate 8
+python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-prompts 1200 --request-rate 4
+python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-prompts 2400 --request-rate 8
+```
+## Offline benchmarks
+```bash
+# Llama 3.1 8B Instruct on 1 x A100
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096
+# Llama 3.1 70B Instruct on 4 x H100
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 4 --mem-frac 0.88
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096
+# bench serving
+python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 5000
+python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-prompts 5000
+```

sglang/benchmark/blog_v0_2/405b_sglang.sh ADDED Viewed

	@@ -0,0 +1,24 @@

+# Benchmark commands for an SGLang server (backend `sglang`); each run's
+# stdout is redirected to its own sglang_log* file.
+#
+# Create dummy weights:
+# 1. Create a folder `~/llama-3.1-405b-fp8-dummy` and create `config.json` and tokenizer under this folder.
+# 2. Get `config.json` from ./config.md
+# 3. Download the tokenizer
+#   wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json
+#   wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json
+# Launch sglang
+# python -m sglang.launch_server --model-path ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quantization fp8 --disable-radix --mem-frac 0.87
+# offline (no --request-rate is passed)
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > sglang_log11
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 > sglang_log12
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 > sglang_log13
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 > sglang_log14
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 > sglang_log15
+python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 2000 > sglang_log21
+# online (fixed --request-rate of 1, 2, 4, 8 and 16)
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 > sglang_log31
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > sglang_log32
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > sglang_log33
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > sglang_log34
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > sglang_log35

sglang/benchmark/blog_v0_2/405b_trt.sh ADDED Viewed

	@@ -0,0 +1,17 @@

+# Launch trtllm
+# https://github.com/sgl-project/tensorrt-demo
+# offline
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log11
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log12
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log13
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log14
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log15
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 2000 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log21
+# online
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log31
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log32
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log33
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log34
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log35

sglang/benchmark/blog_v0_2/405b_vllm.sh ADDED Viewed

	@@ -0,0 +1,24 @@

+# Benchmark commands for a vLLM server (backend `vllm`); each run's stdout is
+# redirected to its own vllm_log* file.
+#
+# Create dummy weights:
+# 1. Create a folder `~/llama-3.1-405b-fp8-dummy` and create `config.json` and tokenizer under this folder.
+# 2. Get `config.json` from ./config.md
+# 3. Download the tokenizer
+#   wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json
+#   wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json
+# Launch vllm
+# python3 -m vllm.entrypoints.openai.api_server --model ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --disable-log-requests --tensor-parallel-size 8 --max-model-len 10000
+# offline (no --request-rate is passed)
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > vllm_log11
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 > vllm_log12
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 > vllm_log13
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 > vllm_log14
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 > vllm_log15
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 2000 > vllm_log21
+# online (fixed --request-rate of 1, 2, 4, 8 and 16)
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 > vllm_log31
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > vllm_log32
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > vllm_log33
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > vllm_log34
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > vllm_log35

sglang/benchmark/blog_v0_2/README.md ADDED Viewed

	@@ -0,0 +1,164 @@

+# How to reproduce the benchmark results of SGLang
+## Prerequisite
+### Install the latest SGLang
+```bash
+git clone https://github.com/sgl-project/sglang.git
+cd sglang
+git checkout v0.2.7
+pip install --upgrade pip
+pip install -e "python[all]"
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+```
+### Set up ulimit and HF_TOKEN
+```bash
+ulimit -n 65535
+# Change the token to a real and usable one, with access permissions for the Llama 3 models.
+export HF_TOKEN=hf_token
+```
+### Launch the server
+```bash
+# Meta-Llama-3.1-8B-Instruct
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
+# Meta-Llama-3.1-70B-Instruct
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 8
+# Meta-Llama-3-70B-Instruct-FP8
+python -m sglang.launch_server --model-path neuralmagic/Meta-Llama-3-70B-Instruct-FP8 --disable-radix-cache --tp 8
+```
+## Benchmark
+### Hardware Requirements
+- 8B models: Single NVIDIA A100 80GB GPU
+- 70B models: 8 x NVIDIA A100 80GB GPUs with Tensor Parallelism (TP) 8
+- 70B FP8 models: 8 x NVIDIA H100 GPUs with Tensor Parallelism (TP) 8
+Please ensure you have the appropriate hardware before running the benchmarks.
+#### Offline benchmark
+```bash
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 3000 --output-file offline.jsonl
+cat offline.jsonl | cut -d':' -f12 | cut -d',' -f1
+```
+#### Online benchmark
+```bash
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online.jsonl
+cat online.jsonl | cut -d':' -f9 | cut -d',' -f1
+```
+## Other
+We tried vLLM 0.5.3.post1, but it often crashed under high load, and in our partial benchmarking it performed similarly to or worse than vLLM 0.5.2, so we use the older version, vLLM 0.5.2.
+To prepare TensorRT LLM, refer to https://github.com/sgl-project/tensorrt-demo. Specifically, we used a batch size of 512, a max input length of 8192, and a max number of tokens of 8192. The instance count for preprocessing and postprocessing in Triton Server is 16.
+```bash
+# vLLM
+pip install vllm==0.5.2
+pip install jsonschema==4.21.1
+# Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-8B-Instruct --disable-log-requests
+# meta-llama/Meta-Llama-3-70B-Instruct
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B-Instruct --disable-log-requests --tensor 8
+# neuralmagic/Meta-Llama-3-70B-Instruct-FP8
+python -m vllm.entrypoints.openai.api_server --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 --disable-log-requests --tensor 8
+```
+```bash
+wget https://raw.githubusercontent.com/sgl-project/sglang/main/python/sglang/bench_serving.py
+```
+```bash
+# vLLM Offline
+python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline_vllm.jsonl
+python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline_vllm.jsonl
+python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline_vllm.jsonl
+python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline_vllm.jsonl
+python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline_vllm.jsonl
+python3 bench_serving.py --backend vllm --dataset-name sharegpt --num-prompts 3000 --output-file offline_vllm.jsonl
+cat offline_vllm.jsonl | cut -d':' -f12 | cut -d',' -f1
+```
+```bash
+# vLLM Online
+python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online_vllm.jsonl
+python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online_vllm.jsonl
+python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online_vllm.jsonl
+python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online_vllm.jsonl
+python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online_vllm.jsonl
+cat online_vllm.jsonl | cut -d':' -f9 | cut -d',' -f1
+```
+```bash
+# TensorRT LLM Offline 8B
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline_trt_8b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline_trt_8b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline_trt_8b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline_trt_8b.jsonl
+python3 bench_serving.py --backend trt --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline_trt_8b.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name sharegpt --num-prompts 3000 --output-file offline_trt_8b.jsonl
+cat offline_trt_8b.jsonl | cut -d':' -f12 | cut -d',' -f1
+```
+```bash
+# TensorRT LLM Online 8B
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online_trt_8b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online_trt_8b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online_trt_8b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online_trt_8b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online_trt_8b.jsonl
+cat online_trt_8b.jsonl | cut -d':' -f9 | cut -d',' -f1
+```
+```bash
+# TensorRT LLM Offline 70B
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline_trt_70b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline_trt_70b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline_trt_70b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline_trt_70b.jsonl
+python3 bench_serving.py --backend trt --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline_trt_70b.jsonl --model meta-llama/Meta-Llama-3-70B-Instruct
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name sharegpt --num-prompts 3000 --output-file offline_trt_70b.jsonl
+cat offline_trt_70b.jsonl | cut -d':' -f12 | cut -d',' -f1
+```
+```bash
+# TensorRT LLM Online 70B
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online_trt_70b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online_trt_70b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online_trt_70b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online_trt_70b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online_trt_70b.jsonl
+cat online_trt_70b.jsonl | cut -d':' -f9 | cut -d',' -f1
+```

sglang/benchmark/blog_v0_2/config.md ADDED Viewed

	@@ -0,0 +1,100 @@

+### used for TensorRT LLM
+```
+{
+    "architecture": "LlamaForCausalLM",
+    "dtype": "float16",
+    "logits_dtype": "float32",
+    "vocab_size": 128256,
+    "max_position_embeddings": 8192,
+    "hidden_size": 16384,
+    "num_hidden_layers": 126,
+    "num_attention_heads": 128,
+    "num_key_value_heads": 16,
+    "head_size": 128,
+    "qk_layernorm": false,
+    "hidden_act": "silu",
+    "intermediate_size": 53248,
+    "norm_epsilon": 1e-05,
+    "position_embedding_type": "rope_gpt_neox",
+    "use_parallel_embedding": false,
+    "embedding_sharding_dim": 0,
+    "share_embedding_table": false,
+    "mapping": {
+        "world_size": 8,
+        "tp_size": 8,
+        "pp_size": 1,
+        "gpus_per_node": 8
+    },
+    "quantization": {
+        "quant_algo": "FP8",
+        "kv_cache_quant_algo": null,
+        "group_size": 128,
+        "smoothquant_val": null,
+        "has_zero_point": false,
+        "pre_quant_scale": false,
+        "exclude_modules": [
+            "lm_head"
+        ]
+    },
+    "kv_dtype": "float16",
+    "rotary_scaling": null,
+    "residual_mlp": false,
+    "moe_normalization_mode": null,
+    "rotary_base": 500000.0,
+    "moe_num_experts": 0,
+    "moe_top_k": 0,
+    "moe_tp_mode": 2,
+    "attn_bias": false,
+    "disable_weight_only_quant_plugin": false,
+    "mlp_bias": false
+}
+```
+### used for vLLM and SGLang
+```
+{
+  "_name_or_path": "dummy_fp8",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": 128009,
+  "hidden_act": "silu",
+  "hidden_size": 16384,
+  "initializer_range": 0.02,
+  "intermediate_size": 53248,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 128,
+  "num_hidden_layers": 126,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "quantization_config": {
+    "activation_scheme": "static",
+    "ignored_layers": [
+      "lm_head"
+    ],
+    "quant_method": "fp8"
+  },
+  "rope_scaling": {
+    "factor": 8.0,
+    "low_freq_factor": 1.0,
+    "high_freq_factor": 4.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "max_position_embeddings": 131072,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.41.1",
+  "use_cache": true,
+  "vocab_size": 128256
+}
+```

sglang/benchmark/boolq/README.md ADDED Viewed

	@@ -0,0 +1,19 @@

+## Download data
+```
+git clone https://hf-mirror.com/datasets/google/boolq
+```
+## Convert parquet to json
+```
+bash parquet_to_json.sh
+```
+## Run benchmark
+### Benchmark sglang
+```
+python -m sglang.launch_server --model-path ramblingpolymath/Qwen3-32B-W8A8 --port 30000
+```
+```
+python3 bench_sglang.py
+```

sglang/benchmark/boolq/bench_sglang.py ADDED Viewed

	@@ -0,0 +1,124 @@

+import argparse
+import json
+import time
+import numpy as np
+from sglang.api import set_default_backend
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import read_jsonl
+def get_example(lines, i, answer):
+    prompt = "Question: " + lines[i]["question"] + lines[i]["passage"] + "\nAnswer:"
+    if answer:
+        prompt += str(lines[i]["answer"])
+    return prompt
+def few_shot_examples(lines, k):
+    prompts = ""
+    for i in range(k):
+        prompts += get_example(lines, i, True) + "\n\n"
+    return prompts
+def main(args):
+    # Select backend
+    set_default_backend(select_sglang_backend(args))
+    # Read data
+    train_data_path = args.train_data_path
+    test_data_path = args.test_data_path
+    lines_train = list(read_jsonl(train_data_path))
+    lines_test = list(read_jsonl(test_data_path))
+    # Construct prompts
+    num_questions = args.num_questions
+    num_shots = args.num_shots
+    few_shots = few_shot_examples(lines_train, num_shots)
+    questions = []
+    answer = []
+    for i in range(len(lines_test[:num_questions])):
+        questions.append(get_example(lines_test, i, False))
+        answer.append(str(lines_test[i]["answer"]))
+    arguments = [{"question": q} for q in questions]
+    #####################################
+    ######### SGL Program Begin #########
+    #####################################
+    import sglang as sgl
+    @sgl.function
+    def few_shot_boolq(s, question):
+        s += few_shots + question
+        s += sgl.gen("answer", max_tokens=5, stop=["\n"])
+    #####################################
+    ########## SGL Program End ##########
+    #####################################
+    # Run requests
+    tic = time.perf_counter()
+    states = few_shot_boolq.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+    preds = []
+    for i in range(len(states)):
+        preds.append(states[i]["answer"])
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(answer))
+    # Compute speed
+    num_output_tokens = sum(
+        s.get_meta_info("answer")["completion_tokens"] for s in states
+    )
+    output_throughput = num_output_tokens / latency
+    # Print results
+    print(f"Accuracy: {acc:.3f}")
+    print(f"Latency: {latency:.3f} s")
+    print(f"Output throughput: {output_throughput:.3f} token/s")
+    # Results
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "boolq",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num-shots", type=int, default=5)
+    parser.add_argument(
+        "--train-data-path", type=str, default="./boolq/data/train-00000-of-00001.json"
+    )
+    parser.add_argument(
+        "--test-data-path",
+        type=str,
+        default="./boolq/data/validation-00000-of-00001.json",
+    )
+    parser.add_argument("--num-questions", type=int, default=200)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)

sglang/benchmark/boolq/convert_parquet_to_json.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import sys
+import pyarrow.parquet as pq
+def convert_parquet_to_json(input_file, output_file):
+    # read parquet file
+    table = pq.read_table(input_file)
+    # turn parquet data to dataframe
+    df = table.to_pandas()
+    # turn dataframe to json form
+    json_data = df.to_json(orient="records", lines=True)
+    # write json to file
+    with open(output_file, "w") as f:
+        f.write(json_data)
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print("Usage:python convert_parquet_to_json.py <input_file> <output_file>")
+    input_file = sys.argv[1]
+    output_file = sys.argv[2]
+    convert_parquet_to_json(input_file, output_file)

sglang/benchmark/boolq/parquet_to_json.sh ADDED Viewed

	@@ -0,0 +1,26 @@

+#!/bin/bash
+# Convert the BoolQ parquet splits to JSON Lines with convert_parquet_to_json.py.
+# Exits with a non-zero status if any conversion fails.
+# Define input and output directories
+input_dir="./boolq/data"
+output_dir="./boolq/data"
+# Define the files to convert
+files=(
+        "train-00000-of-00001.parquet"
+        "validation-00000-of-00001.parquet"
+)
+# Remember whether any conversion failed so the script can report it
+status=0
+# For each file above, run the Python script to convert it
+for file in "${files[@]}"; do
+    input_file="${input_dir}/${file}"
+    output_file="${output_dir}/${file%.parquet}.json"
+    echo "Converting ${input_file} to ${output_file} ..."
+    if python3 convert_parquet_to_json.py "${input_file}" "${output_file}"; then
+        echo "Conversion successful: ${output_file}"
+    else
+        echo "Conversion failed: ${input_file}" >&2
+        status=1
+    fi
+done
+exit "${status}"

sglang/benchmark/ceval/README.md ADDED Viewed

	@@ -0,0 +1,15 @@

+## Download data
+```
+git lfs clone https://huggingface.co/datasets/ceval/ceval-exam ceval/ceval-exam
+```
+## Run benchmark
+### Benchmark sglang
+```
+python -m sglang.launch_server --model-path ramblingpolymath/Qwen3-32B-W8A8 --port 30000
+```
+```
+python3 bench_sglang.py
+```

sglang/benchmark/ceval/bench_sglang.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import argparse
+import json
+import os
+import random
+import re
+import time
+import numpy as np
+from datasets import load_dataset
+from sglang.lang.api import set_default_backend
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+# Option letters of a multiple-choice question; get_answer_value picks one of
+# these at random when no answer can be parsed from a response.
+choices = ["A", "B", "C", "D"]
+def get_one_example(line, include_answer):
+    res = line["question"]
+    res += f"\nA. {line['A']}"
+    res += f"\nB. {line['B']}"
+    res += f"\nC. {line['C']}"
+    res += f"\nD. {line['D']}"
+    if include_answer:
+        res += f"\nAnswer: {line['answer']} \n\n"
+    return res
+def get_few_shot_examples(lines):
+    res = ""
+    for line in lines:
+        res += get_one_example(line, True) + "\n\n"
+    return res
+def get_answer_value(response):
+    pattern = r"(Answer:|answer:|答案是|答案是:|正确答案是:|答案:|Assistant:)\s*([A-D])(?![\w])"
+    match = re.search(pattern, response)
+    if match:
+        return match.group(2)
+    return random.choice(choices)
+def main(args):
+    # Read data && Construct prompts
+    arguments = []
+    labels = []
+    examples = "examples:\n"
+    data_path = args.data_path
+    for subject in os.listdir(data_path):
+        subject_path = os.path.join(data_path, subject)
+        if os.path.isdir(subject_path) and subject != ".git":
+            dataset = load_dataset(data_path, name=subject)
+            dev_lines_temp = dataset["dev"]
+            val_lines_temp = dataset["val"]
+            few_shot_examples = get_few_shot_examples(dev_lines_temp)
+            examples += f"{few_shot_examples}"
+            for val_line in val_lines_temp:
+                arguments.append(
+                    {
+                        "examples": few_shot_examples,
+                        "question": get_one_example(val_line, False),
+                    }
+                )
+                labels.append(val_line["answer"])
+    #####################################
+    ######### SGL Program Begin #########
+    #####################################
+    import sglang as sgl
+    @sgl.function
+    def few_shot_ceval(s, examples, question):
+        s += examples + question + sgl.gen("Answer")
+    #####################################
+    ########## SGL Program End ##########
+    #####################################
+    num_questions = args.num_questions if args.num_questions else len(arguments)
+    # Select backend
+    set_default_backend(select_sglang_backend(args))
+    # Run requests
+    tic = time.perf_counter()
+    states = few_shot_ceval.run_batch(
+        arguments[:num_questions],
+        temperature=0,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+    preds = [get_answer_value(states[i]["Answer"]) for i in range(num_questions)]
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels[:num_questions]))
+    # Compute speed
+    num_output_tokens = sum(
+        s.get_meta_info("Answer")["completion_tokens"] for s in states
+    )
+    output_throughput = num_output_tokens / latency
+    # Print results
+    print(f"Accuracy: {acc:.3f}")
+    print(f"Latency: {latency:.3f} s")
+    print(f"Output throughput: {output_throughput:.3f} token/s")
+    # Write results
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "ceval",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="ceval/ceval-exam")
+    parser.add_argument("--num-questions", type=int, default=None)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)