diff --git a/early_exit_20250817_layers_5_kl0_25/README.md b/early_exit_20250817_layers_5_kl0_25/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/early_exit_probes.pt b/early_exit_20250817_layers_5_kl0_25/early_exit_probes.pt new file mode 100644 index 0000000000000000000000000000000000000000..3bf52563f59b859deb31c4633556ba81f89dfe6f --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/early_exit_probes.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eae2a5cd25fc886b86de26d620e8de30e93eff6ff5f61f15d9f9d2a690948cf3 +size 94502251 diff --git a/early_exit_20250817_layers_5_kl0_25/step_1000/README.md b/early_exit_20250817_layers_5_kl0_25/step_1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_1000/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/step_1000/early_exit_probes.pt b/early_exit_20250817_layers_5_kl0_25/step_1000/early_exit_probes.pt new file mode 100644 index 0000000000000000000000000000000000000000..37a7ff0baccb95ff3efa034d1d109df98528218d --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_1000/early_exit_probes.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81db34bf182c82165bddb90d983fa581eb099521374db58cf6ac7dc0bba65af7 +size 94502251 diff --git a/early_exit_20250817_layers_5_kl0_25/step_1000/metadata.json b/early_exit_20250817_layers_5_kl0_25/step_1000/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_1000/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/step_1500/README.md b/early_exit_20250817_layers_5_kl0_25/step_1500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_1500/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/step_1500/early_exit_probes.pt b/early_exit_20250817_layers_5_kl0_25/step_1500/early_exit_probes.pt new file mode 100644 index 0000000000000000000000000000000000000000..936b8c7d3737b805fc2d128b66cd1e58a9f25060 --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_1500/early_exit_probes.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94d20c223a02c8b5b58cd07ec15727e8e3404085d7451de2f3396930baa5450a +size 94502251 diff --git a/early_exit_20250817_layers_5_kl0_25/step_1500/early_exiter/adapter_config.json b/early_exit_20250817_layers_5_kl0_25/step_1500/early_exiter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..10dc5c10169f498027c2e33a0c47bf6000e26055 --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_1500/early_exiter/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "q_proj", + "v_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/step_1500/metadata.json b/early_exit_20250817_layers_5_kl0_25/step_1500/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_1500/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/step_2000/early_exit_probes.pt b/early_exit_20250817_layers_5_kl0_25/step_2000/early_exit_probes.pt new file mode 100644 index 0000000000000000000000000000000000000000..94fbee66bc390e8128f3d8d76a6c62c82d6589ce --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_2000/early_exit_probes.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d663ccae1cfdef9afbc1a5099f3636403124f39bcd215b8445a2404ef65ee80a +size 94502251 diff --git a/early_exit_20250817_layers_5_kl0_25/step_2000/metadata.json b/early_exit_20250817_layers_5_kl0_25/step_2000/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_2000/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/step_2500/README.md b/early_exit_20250817_layers_5_kl0_25/step_2500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_2500/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/step_2500/metadata.json b/early_exit_20250817_layers_5_kl0_25/step_2500/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_2500/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/step_3000/README.md b/early_exit_20250817_layers_5_kl0_25/step_3000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_3000/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/step_3000/early_exit_probes.pt b/early_exit_20250817_layers_5_kl0_25/step_3000/early_exit_probes.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0f03257ce122feb7b43abc6d95e5fb60b6b9b89 --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_3000/early_exit_probes.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cdce6f732afb18e33d64e706d19a7f300790c3154b331594e17ae7941ffd42e +size 94502251 diff --git a/early_exit_20250817_layers_5_kl0_25/step_3000/metadata.json b/early_exit_20250817_layers_5_kl0_25/step_3000/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_3000/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/step_3500/README.md b/early_exit_20250817_layers_5_kl0_25/step_3500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_3500/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/step_3500/early_exit_probes.pt b/early_exit_20250817_layers_5_kl0_25/step_3500/early_exit_probes.pt new file mode 100644 index 0000000000000000000000000000000000000000..de4f48320e7e6c4d02a0802b44ecf6cfab175d5a --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_3500/early_exit_probes.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cfd8cc776da418c1adc46643efd60e8e759a44ff8dc66e0dc013aafc18c4c2d +size 94502251 diff --git a/early_exit_20250817_layers_5_kl0_25/step_4000/README.md b/early_exit_20250817_layers_5_kl0_25/step_4000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_4000/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/step_4000/early_exit_probes.pt b/early_exit_20250817_layers_5_kl0_25/step_4000/early_exit_probes.pt new file mode 100644 index 0000000000000000000000000000000000000000..59ccf4b37adc251ea116764f4e9521415921f48b --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_4000/early_exit_probes.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ceadc0e493cbde0055cd59c1824859a944d6b4661fe82dfb468925eff864c968 +size 94502251 diff --git a/early_exit_20250817_layers_5_kl0_25/step_4000/metadata.json b/early_exit_20250817_layers_5_kl0_25/step_4000/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_4000/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/step_4500/early_exit_probes.pt b/early_exit_20250817_layers_5_kl0_25/step_4500/early_exit_probes.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5c664a5011747ee80ff23416ce9cc4b6305f55b --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_4500/early_exit_probes.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:338e907e19924c621dedcaf674645bda94adf44054138c00195d78b95f3815f7 +size 94502251 diff --git a/early_exit_20250817_layers_5_kl0_25/step_500/README.md b/early_exit_20250817_layers_5_kl0_25/step_500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_500/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/step_500/early_exiter/adapter_config.json b/early_exit_20250817_layers_5_kl0_25/step_500/early_exiter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..10dc5c10169f498027c2e33a0c47bf6000e26055 --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_500/early_exiter/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "q_proj", + "v_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/step_500/metadata.json b/early_exit_20250817_layers_5_kl0_25/step_500/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_500/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/step_5000/early_exit_probes.pt b/early_exit_20250817_layers_5_kl0_25/step_5000/early_exit_probes.pt new file mode 100644 index 0000000000000000000000000000000000000000..19bde47615b581a5775f84dc72c42d1082b3b311 --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_5000/early_exit_probes.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8642d1bd2e33bb195ecb07b97f9910ced7415b458a084fb25f87cc1fd96dc2d +size 94502251 diff --git a/early_exit_20250817_layers_5_kl0_25/step_5000/metadata.json b/early_exit_20250817_layers_5_kl0_25/step_5000/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_5000/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/step_5500/README.md b/early_exit_20250817_layers_5_kl0_25/step_5500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_5500/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_25/step_5500/metadata.json b/early_exit_20250817_layers_5_kl0_25/step_5500/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_25/step_5500/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_1000/README.md b/early_exit_20250817_layers_5_kl0_5/step_1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_1000/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_1000/early_exiter/adapter_config.json b/early_exit_20250817_layers_5_kl0_5/step_1000/early_exiter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..18aa2e1df08e9edc13e2e990cd1da094b2f648ca --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_1000/early_exiter/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "o_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_1000/metadata.json b/early_exit_20250817_layers_5_kl0_5/step_1000/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_1000/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_1500/README.md b/early_exit_20250817_layers_5_kl0_5/step_1500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_1500/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_1500/early_exiter/adapter_config.json b/early_exit_20250817_layers_5_kl0_5/step_1500/early_exiter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..18aa2e1df08e9edc13e2e990cd1da094b2f648ca --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_1500/early_exiter/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "o_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_1500/metadata.json b/early_exit_20250817_layers_5_kl0_5/step_1500/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_1500/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_2000/README.md b/early_exit_20250817_layers_5_kl0_5/step_2000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_2000/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_2000/early_exiter/adapter_config.json b/early_exit_20250817_layers_5_kl0_5/step_2000/early_exiter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..18aa2e1df08e9edc13e2e990cd1da094b2f648ca --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_2000/early_exiter/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "o_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_2000/metadata.json b/early_exit_20250817_layers_5_kl0_5/step_2000/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_2000/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_2500/README.md b/early_exit_20250817_layers_5_kl0_5/step_2500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_2500/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_2500/early_exiter/adapter_config.json b/early_exit_20250817_layers_5_kl0_5/step_2500/early_exiter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..18aa2e1df08e9edc13e2e990cd1da094b2f648ca --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_2500/early_exiter/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "o_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_2500/metadata.json b/early_exit_20250817_layers_5_kl0_5/step_2500/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_2500/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_3000/README.md b/early_exit_20250817_layers_5_kl0_5/step_3000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_3000/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_3000/early_exiter/adapter_config.json b/early_exit_20250817_layers_5_kl0_5/step_3000/early_exiter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..18aa2e1df08e9edc13e2e990cd1da094b2f648ca --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_3000/early_exiter/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "o_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_3000/metadata.json b/early_exit_20250817_layers_5_kl0_5/step_3000/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_3000/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_3500/README.md b/early_exit_20250817_layers_5_kl0_5/step_3500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_3500/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_3500/early_exiter/adapter_config.json b/early_exit_20250817_layers_5_kl0_5/step_3500/early_exiter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..18aa2e1df08e9edc13e2e990cd1da094b2f648ca --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_3500/early_exiter/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "o_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_3500/metadata.json b/early_exit_20250817_layers_5_kl0_5/step_3500/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_3500/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_4000/README.md b/early_exit_20250817_layers_5_kl0_5/step_4000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_4000/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_4000/early_exiter/adapter_config.json b/early_exit_20250817_layers_5_kl0_5/step_4000/early_exiter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..18aa2e1df08e9edc13e2e990cd1da094b2f648ca --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_4000/early_exiter/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "o_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_4000/metadata.json b/early_exit_20250817_layers_5_kl0_5/step_4000/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_4000/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_4500/early_exiter/adapter_config.json b/early_exit_20250817_layers_5_kl0_5/step_4500/early_exiter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..18aa2e1df08e9edc13e2e990cd1da094b2f648ca --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_4500/early_exiter/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "o_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_4500/metadata.json b/early_exit_20250817_layers_5_kl0_5/step_4500/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_4500/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_500/README.md b/early_exit_20250817_layers_5_kl0_5/step_500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_500/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_500/early_exiter/adapter_config.json b/early_exit_20250817_layers_5_kl0_5/step_500/early_exiter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..18aa2e1df08e9edc13e2e990cd1da094b2f648ca --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_500/early_exiter/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "o_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl0_5/step_500/metadata.json b/early_exit_20250817_layers_5_kl0_5/step_500/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl0_5/step_500/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl1_0/step_1000/README.md b/early_exit_20250817_layers_5_kl1_0/step_1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl1_0/step_1000/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl1_0/step_2000/early_exiter/adapter_config.json b/early_exit_20250817_layers_5_kl1_0/step_2000/early_exiter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..10dc5c10169f498027c2e33a0c47bf6000e26055 --- /dev/null +++ b/early_exit_20250817_layers_5_kl1_0/step_2000/early_exiter/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "q_proj", + "v_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl1_0/step_2500/README.md b/early_exit_20250817_layers_5_kl1_0/step_2500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl1_0/step_2500/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl1_0/step_2500/metadata.json b/early_exit_20250817_layers_5_kl1_0/step_2500/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl1_0/step_2500/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl1_0/step_3500/README.md b/early_exit_20250817_layers_5_kl1_0/step_3500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl1_0/step_3500/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl1_0/step_3500/early_exiter/adapter_config.json b/early_exit_20250817_layers_5_kl1_0/step_3500/early_exiter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..10dc5c10169f498027c2e33a0c47bf6000e26055 --- /dev/null +++ b/early_exit_20250817_layers_5_kl1_0/step_3500/early_exiter/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "q_proj", + "v_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl1_0/step_4000/README.md b/early_exit_20250817_layers_5_kl1_0/step_4000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl1_0/step_4000/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl1_0/step_4000/metadata.json b/early_exit_20250817_layers_5_kl1_0/step_4000/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl1_0/step_4000/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl1_0_rank8/step_1000/README.md b/early_exit_20250817_layers_5_kl1_0_rank8/step_1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl1_0_rank8/step_1000/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl1_0_rank8/step_1000/metadata.json b/early_exit_20250817_layers_5_kl1_0_rank8/step_1000/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl1_0_rank8/step_1000/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl1_0_rank8/step_2000/README.md b/early_exit_20250817_layers_5_kl1_0_rank8/step_2000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl1_0_rank8/step_2000/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl1_0_rank8/step_2000/early_exiter/adapter_config.json b/early_exit_20250817_layers_5_kl1_0_rank8/step_2000/early_exiter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e9116cf5f47dbdbf5101cf322fa6ac870a51060d --- /dev/null +++ b/early_exit_20250817_layers_5_kl1_0_rank8/step_2000/early_exiter/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "o_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl1_0_rank8/step_2000/metadata.json b/early_exit_20250817_layers_5_kl1_0_rank8/step_2000/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl1_0_rank8/step_2000/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl1_0_rank8/step_3000/README.md b/early_exit_20250817_layers_5_kl1_0_rank8/step_3000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl1_0_rank8/step_3000/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl1_0_rank8/step_3000/early_exiter/adapter_config.json b/early_exit_20250817_layers_5_kl1_0_rank8/step_3000/early_exiter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e9116cf5f47dbdbf5101cf322fa6ac870a51060d --- /dev/null +++ b/early_exit_20250817_layers_5_kl1_0_rank8/step_3000/early_exiter/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "o_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl1_0_rank8/step_3000/metadata.json b/early_exit_20250817_layers_5_kl1_0_rank8/step_3000/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250817_layers_5_kl1_0_rank8/step_3000/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl1_0_rank8/step_4000/README.md b/early_exit_20250817_layers_5_kl1_0_rank8/step_4000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb0893c2a4d79d9bc98c4f39e5b899f11c962dfb --- /dev/null +++ b/early_exit_20250817_layers_5_kl1_0_rank8/step_4000/README.md @@ -0,0 +1,207 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.0 \ No newline at end of file diff --git a/early_exit_20250817_layers_5_kl1_0_rank8/step_4000/early_exiter/adapter_config.json b/early_exit_20250817_layers_5_kl1_0_rank8/step_4000/early_exiter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e9116cf5f47dbdbf5101cf322fa6ac870a51060d --- /dev/null +++ b/early_exit_20250817_layers_5_kl1_0_rank8/step_4000/early_exiter/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "k_proj", + "o_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/early_exit_20250818_layers_5_kl1_0/metadata.json b/early_exit_20250818_layers_5_kl1_0/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4453fca5915cf996beb71cc25dfa4e16aa14609c --- /dev/null +++ b/early_exit_20250818_layers_5_kl1_0/metadata.json @@ -0,0 +1,13 @@ +{ + "base_model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "exitable_layer_idxs": [ + 5.0, + 10.0, + 15.0, + 20.0, + 25.0, + Infinity + ], + "total_exitable_layers": 5, + "has_early_exit_probes": true +} \ No newline at end of file diff --git a/early_exit_20250818_layers_5_kl1_0/step_3000/early_exit_probes.pt b/early_exit_20250818_layers_5_kl1_0/step_3000/early_exit_probes.pt new file mode 100644 index 0000000000000000000000000000000000000000..3eaa7e1fc291f3db6515d174a696b54d5f195feb --- /dev/null +++ b/early_exit_20250818_layers_5_kl1_0/step_3000/early_exit_probes.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdb3aa8117033d2ecca9aa162e883173380c23f3bfab7d8f1f68b0545de80f87 +size 94502251