Upload DistilGPT-2 MAXIMUM - Best possible training (LoRA r=32, 15 epochs, 500 murlis)
Browse files- README.md +300 -0
- adapter_config.json +38 -0
- adapter_model.safetensors +3 -0
- checkpoint-286/README.md +207 -0
- checkpoint-286/adapter_config.json +38 -0
- checkpoint-286/adapter_model.safetensors +3 -0
- checkpoint-286/merges.txt +0 -0
- checkpoint-286/optimizer.pt +3 -0
- checkpoint-286/rng_state.pth +3 -0
- checkpoint-286/scheduler.pt +3 -0
- checkpoint-286/special_tokens_map.json +6 -0
- checkpoint-286/tokenizer.json +0 -0
- checkpoint-286/tokenizer_config.json +21 -0
- checkpoint-286/trainer_state.json +230 -0
- checkpoint-286/training_args.bin +3 -0
- checkpoint-286/vocab.json +0 -0
- checkpoint-308/README.md +207 -0
- checkpoint-308/adapter_config.json +38 -0
- checkpoint-308/adapter_model.safetensors +3 -0
- checkpoint-308/merges.txt +0 -0
- checkpoint-308/optimizer.pt +3 -0
- checkpoint-308/rng_state.pth +3 -0
- checkpoint-308/scheduler.pt +3 -0
- checkpoint-308/special_tokens_map.json +6 -0
- checkpoint-308/tokenizer.json +0 -0
- checkpoint-308/tokenizer_config.json +21 -0
- checkpoint-308/trainer_state.json +244 -0
- checkpoint-308/training_args.bin +3 -0
- checkpoint-308/vocab.json +0 -0
- checkpoint-330/README.md +207 -0
- checkpoint-330/adapter_config.json +38 -0
- checkpoint-330/adapter_model.safetensors +3 -0
- checkpoint-330/merges.txt +0 -0
- checkpoint-330/optimizer.pt +3 -0
- checkpoint-330/rng_state.pth +3 -0
- checkpoint-330/scheduler.pt +3 -0
- checkpoint-330/special_tokens_map.json +6 -0
- checkpoint-330/tokenizer.json +0 -0
- checkpoint-330/tokenizer_config.json +21 -0
- checkpoint-330/trainer_state.json +265 -0
- checkpoint-330/training_args.bin +3 -0
- checkpoint-330/vocab.json +0 -0
- merges.txt +0 -0
- special_tokens_map.json +6 -0
- tokenizer.json +0 -0
- tokenizer_config.json +21 -0
- training_info.json +38 -0
- vocab.json +0 -0
README.md
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language: en
|
| 3 |
+
license: mit
|
| 4 |
+
tags:
|
| 5 |
+
- spiritual-ai
|
| 6 |
+
- brahma-kumaris
|
| 7 |
+
- murli
|
| 8 |
+
- distilgpt2
|
| 9 |
+
- maximum-accuracy
|
| 10 |
+
- experimental
|
| 11 |
+
- research-only
|
| 12 |
+
- peft
|
| 13 |
+
- lora
|
| 14 |
+
library_name: peft
|
| 15 |
+
base_model: distilgpt2
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
# 🕉️ Murli Assistant - DistilGPT-2 MAXIMUM (Experimental)
|
| 19 |
+
|
| 20 |
+
⚠️ **WARNING: EXPERIMENTAL MODEL - NOT FOR PRODUCTION USE** ⚠️
|
| 21 |
+
|
| 22 |
+
This model represents the **absolute maximum possible training** for DistilGPT-2 on murli content, but **quality remains insufficient** for spiritual guidance. Deployed for research, comparison, and educational purposes only.
|
| 23 |
+
|
| 24 |
+
## ⚠️ Critical Limitations
|
| 25 |
+
|
| 26 |
+
### Known Quality Issues:
|
| 27 |
+
- ❌ **Hallucinations persist** despite maximum training
|
| 28 |
+
- ❌ **Social media contamination** (Twitter URLs, @mentions in responses)
|
| 29 |
+
- ❌ **Factual inaccuracies** in spiritual concepts
|
| 30 |
+
- ❌ **Mixed content** from base model pre-training
|
| 31 |
+
- ❌ **Not suitable for spiritual guidance**
|
| 32 |
+
|
| 33 |
+
### Why This Model Exists:
|
| 34 |
+
- ✅ **Research benchmark** for small model limitations
|
| 35 |
+
- ✅ **Comparison baseline** vs larger models (Phi-2, Flan-T5)
|
| 36 |
+
- ✅ **Educational example** of training optimization
|
| 37 |
+
- ✅ **Proof that model size matters** for specialized domains
|
| 38 |
+
|
| 39 |
+
### Production Recommendation:
|
| 40 |
+
**Use Phi-2 (2.7B params) instead** - the recommended base for a murli chatbot (fine-tuned release coming soon).
|
| 41 |
+
|
| 42 |
+
## 🎯 Maximum Training Configuration
|
| 43 |
+
|
| 44 |
+
### This is the BEST DistilGPT-2 can do:
|
| 45 |
+
|
| 46 |
+
**LoRA Configuration (MAXIMUM):**
|
| 47 |
+
- **Rank (r):** 32 (8x the standard r=4)
|
| 48 |
+
- **Alpha:** 64 (8x the standard alpha=8)
|
| 49 |
+
- **Target Modules:** c_attn, c_proj, c_fc (attention and feed-forward projections in every transformer block)
|
| 50 |
+
- **Trainable Parameters:** 2.36M (2.80% of model)
|
| 51 |
+
- **Dropout:** 0.05 (reduced for maximum learning)
|
| 52 |
+
|
| 53 |
+
**Training Data (MAXIMUM):**
|
| 54 |
+
- **Murlis Used:** 500
|
| 55 |
+
- **Training Examples:** 344
|
| 56 |
+
- **Context Length:** 512 tokens (MAXIMUM)
|
| 57 |
+
- **Spiritual Concepts:** 15 detailed examples with full explanations
|
| 58 |
+
|
| 59 |
+
**Training Configuration (MAXIMUM):**
|
| 60 |
+
- **Epochs:** 15 (5x the standard 3)
|
| 61 |
+
- **Effective Batch Size:** 16
|
| 62 |
+
- **Learning Rate:** 5e-05 (ultra-careful)
|
| 63 |
+
- **Warmup Steps:** 200 (4x the standard; about 60% of the 330 total optimizer steps)
|
| 64 |
+
- **Scheduler:** cosine
|
| 65 |
+
- **Weight Decay:** 0.02 (regularization)
|
| 66 |
+
- **Training Time:** ~2h 50m on CPU
|
| 67 |
+
|
| 68 |
+
**Final Training Loss:** 1.609 (66% improvement over standard 4.77)
|
| 69 |
+
|
| 70 |
+
## 📊 Progressive Training Comparison
|
| 71 |
+
|
| 72 |
+
| Version | LoRA Rank | Epochs | Murlis | Loss | Quality |
|
| 73 |
+
|---------|-----------|--------|--------|------|---------|
|
| 74 |
+
| Standard | 4 | 3 | 150 | 4.77 | ❌ Poor |
|
| 75 |
+
| Enhanced | 16 | 10 | 300 | 2.07 | ❌ Poor |
|
| 76 |
+
| **MAXIMUM** | **32** | **15** | **500** | **1.61** | ❌ **Still Poor** |
|
| 77 |
+
|
| 78 |
+
**Key Finding:** Loss improvement does NOT guarantee quality improvement for small models in specialized domains.
|
| 79 |
+
|
| 80 |
+
## 🔬 What We Learned
|
| 81 |
+
|
| 82 |
+
### Why 82M Parameters Are Insufficient:
|
| 83 |
+
1. **Base Model Dominance:** Pre-trained on broad internet text (OpenWebText), which includes social-media-style content such as Twitter links and @mentions
|
| 84 |
+
2. **Fine-tuning Limitations:** Only 2.8% of the model's parameters are trainable with LoRA
|
| 85 |
+
3. **Knowledge Capacity:** Cannot hold both specialized domain knowledge and general language ability
|
| 86 |
+
4. **Pattern vs Knowledge:** Learns format but not deep spiritual understanding
|
| 87 |
+
|
| 88 |
+
### Improvements in MAXIMUM vs Standard:
|
| 89 |
+
✅ LoRA Rank: 32 (8x from standard, 2x from enhanced)
|
| 90 |
+
✅ LoRA Alpha: 64 (8x from standard, 2x from enhanced)
|
| 91 |
+
✅ Target Modules: c_attn + c_proj + c_fc (attention and feed-forward projections in all blocks)
|
| 92 |
+
✅ Epochs: 15 (5x from standard, 1.5x from enhanced)
|
| 93 |
+
✅ Murlis: 500 (3.3x from standard, 1.67x from enhanced)
|
| 94 |
+
✅ Context: 512 tokens (2x from standard, 1.33x from enhanced)
|
| 95 |
+
✅ 15 detailed spiritual concepts with full explanations
|
| 96 |
+
✅ 7 different formats per murli for comprehensive learning
|
| 97 |
+
✅ Ultra-careful learning rate (5e-5)
|
| 98 |
+
✅ Maximum warmup (200 steps)
|
| 99 |
+
✅ Larger effective batch (16)
|
| 100 |
+
✅ Stronger regularization (0.02 weight decay)
|
| 101 |
+
|
| 102 |
+
### What STILL Doesn't Work:
|
| 103 |
+
- Accurate explanations of core BK concepts
|
| 104 |
+
- Freedom from social media text patterns
|
| 105 |
+
- Consistent factual responses
|
| 106 |
+
- Spiritual guidance reliability
|
| 107 |
+
|
| 108 |
+
## 💻 Usage (For Research/Demo Only)
|
| 109 |
+
|
| 110 |
+
```python
|
| 111 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 112 |
+
from peft import PeftModel
|
| 113 |
+
import torch
|
| 114 |
+
|
| 115 |
+
# Load base model
|
| 116 |
+
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
|
| 117 |
+
base_model = AutoModelForCausalLM.from_pretrained(
|
| 118 |
+
"distilgpt2",
|
| 119 |
+
torch_dtype=torch.float16,
|
| 120 |
+
device_map="auto"
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
# Load MAXIMUM adapter
|
| 124 |
+
model = PeftModel.from_pretrained(
|
| 125 |
+
base_model,
|
| 126 |
+
"eswarankrishnamurthy/murli-assistant-distilgpt2-maximum"
|
| 127 |
+
)
|
| 128 |
+
|
| 129 |
+
# Chat function
|
| 130 |
+
def chat(message):
|
| 131 |
+
prompt = f"Question: {message}\nAnswer:"
|
| 132 |
+
inputs = tokenizer(prompt, return_tensors="pt")
|
| 133 |
+
outputs = model.generate(
|
| 134 |
+
**inputs,
|
| 135 |
+
max_new_tokens=150,
|
| 136 |
+
do_sample=True, temperature=0.7,  # temperature/top_p/top_k are ignored unless do_sample=True
|
| 137 |
+
top_p=0.9,
|
| 138 |
+
top_k=50,
|
| 139 |
+
repetition_penalty=1.2,
|
| 140 |
+
no_repeat_ngram_size=3
|
| 141 |
+
)
|
| 142 |
+
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 143 |
+
return response.split("Answer:", 1)[1].strip() if "Answer:" in response else response
|
| 144 |
+
|
| 145 |
+
# Test (expect mixed quality)
|
| 146 |
+
print(chat("What is soul consciousness?"))
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
## 📈 Performance Metrics
|
| 150 |
+
|
| 151 |
+
**Inference Speed (CPU):**
|
| 152 |
+
- Fastest: 1.13s
|
| 153 |
+
- Average: 2.69s
|
| 154 |
+
- Slowest: 3.55s
|
| 155 |
+
|
| 156 |
+
**Resource Usage:**
|
| 157 |
+
- RAM: ~1.5-2GB
|
| 158 |
+
- Model Size: ~9.4 MB (adapter only; adapter_model.safetensors is 9,443,272 bytes)
|
| 159 |
+
- Base Model: 353 MB (DistilGPT-2)
|
| 160 |
+
|
| 161 |
+
**Compared to Production Models:**
|
| 162 |
+
- **Phi-2 (2.7B):** 33x larger, ⭐⭐⭐⭐⭐ quality, 5-10s inference
|
| 163 |
+
- **Flan-T5:** 3x larger, ⭐⭐⭐⭐ quality, 3-5s inference
|
| 164 |
+
- **DistilGPT-2 MAX:** Smallest, ⭐ quality, 1-3s inference
|
| 165 |
+
|
| 166 |
+
## 🎯 Use Cases
|
| 167 |
+
|
| 168 |
+
### ✅ Appropriate Uses:
|
| 169 |
+
- Research on model size limitations
|
| 170 |
+
- Benchmarking against larger models
|
| 171 |
+
- Speed comparisons
|
| 172 |
+
- Educational demonstrations
|
| 173 |
+
- Training optimization studies
|
| 174 |
+
|
| 175 |
+
### ❌ Inappropriate Uses:
|
| 176 |
+
- **Spiritual guidance** (use Phi-2 instead)
|
| 177 |
+
- **Production chatbot** (unreliable responses)
|
| 178 |
+
- **Educational content** (may teach incorrect concepts)
|
| 179 |
+
- **Public deployment** (without strong disclaimers)
|
| 180 |
+
|
| 181 |
+
## 🔧 Technical Details
|
| 182 |
+
|
| 183 |
+
**Architecture:**
|
| 184 |
+
- Base: DistilGPT-2 (82M parameters)
|
| 185 |
+
- Fine-tuning: LoRA (Low-Rank Adaptation)
|
| 186 |
+
- Modified layers: ALL attention + feed-forward layers
|
| 187 |
+
|
| 188 |
+
**Training Process:**
|
| 189 |
+
1. Connected to MongoDB Atlas (1072 murlis available)
|
| 190 |
+
2. Selected 500 murlis for training
|
| 191 |
+
3. Created 344 enhanced training examples
|
| 192 |
+
4. Trained for 15 epochs with cosine LR schedule
|
| 193 |
+
5. Reached a final training loss of 1.61 (the lowest of the three runs)
|
| 194 |
+
|
| 195 |
+
**What Went Right:**
|
| 196 |
+
- Perfect training convergence
|
| 197 |
+
- Stable gradients throughout
|
| 198 |
+
- Learned BK terminology and format
|
| 199 |
+
- Fast inference speed maintained
|
| 200 |
+
|
| 201 |
+
**What Went Wrong:**
|
| 202 |
+
- Quality didn't match loss improvement
|
| 203 |
+
- Social media patterns contaminate responses
|
| 204 |
+
- Hallucinations persist despite maximum training
|
| 205 |
+
- Cannot reliably explain spiritual concepts
|
| 206 |
+
|
| 207 |
+
## 📚 Research Value
|
| 208 |
+
|
| 209 |
+
This model illustrates several insights relevant to AI/ML research:
|
| 210 |
+
|
| 211 |
+
1. **Model capacity is non-negotiable** for specialized domains
|
| 212 |
+
2. **Loss metrics can be misleading** without quality evaluation
|
| 213 |
+
3. **Fine-tuning has fundamental limits** based on base model size
|
| 214 |
+
4. **More training ≠ better quality** when capacity is insufficient
|
| 215 |
+
5. **Pre-training patterns dominate** small model behavior
|
| 216 |
+
|
| 217 |
+
## 🎓 Educational Message
|
| 218 |
+
|
| 219 |
+
**Before deploying any AI model:**
|
| 220 |
+
- ✅ Test quality thoroughly, not just training metrics
|
| 221 |
+
- ✅ Use appropriate model size for domain complexity
|
| 222 |
+
- ✅ Understand fine-tuning limitations
|
| 223 |
+
- ✅ Consider base model's pre-training influence
|
| 224 |
+
- ✅ Validate against production requirements
|
| 225 |
+
|
| 226 |
+
## 📊 Complete Training History
|
| 227 |
+
|
| 228 |
+
**Completed:** 2025-10-03T12:25:52.051354
|
| 229 |
+
|
| 230 |
+
**Loss Progression:**
|
| 231 |
+
- Epoch 1: 4.68 → 4.48
|
| 232 |
+
- Epoch 5: 3.44 (breakthrough)
|
| 233 |
+
- Epoch 10: 1.81 (excellent convergence)
|
| 234 |
+
- Epoch 15: **1.61 (best achieved with DistilGPT-2 in these runs)**
|
| 235 |
+
|
| 236 |
+
**Gradient Norms:** Stable (0.72 - 1.72)
|
| 237 |
+
|
| 238 |
+
## ⚖️ Final Verdict
|
| 239 |
+
|
| 240 |
+
**Technical Success:** ✅ Perfect training, lowest loss achieved
|
| 241 |
+
**Functional Success:** ❌ Quality insufficient for spiritual guidance
|
| 242 |
+
**Research Value:** ✅ Invaluable insights for model selection
|
| 243 |
+
|
| 244 |
+
### Recommendation:
|
| 245 |
+
**For production murli chatbot, use [Phi-2](https://huggingface.co/microsoft/phi-2)** fine-tuned on murli data.
|
| 246 |
+
|
| 247 |
+
This MAXIMUM model demonstrates that **small models cannot reliably handle specialized spiritual domains**, regardless of training optimization.
|
| 248 |
+
|
| 249 |
+
## 🔗 Related Models
|
| 250 |
+
|
| 251 |
+
- **Standard Version:** [murli-assistant-distilgpt2-lite](https://huggingface.co/eswarankrishnamurthy/murli-assistant-distilgpt2-lite) (LoRA r=4)
|
| 252 |
+
- **Enhanced Version:** To be released (LoRA r=16)
|
| 253 |
+
- **Recommended Production:** Phi-2 based murli assistant (coming soon)
|
| 254 |
+
|
| 255 |
+
## 📝 Citation
|
| 256 |
+
|
| 257 |
+
```bibtex
|
| 258 |
+
@misc{murli-distilgpt2-maximum,
|
| 259 |
+
author = {eswarankrishnamurthy},
|
| 260 |
+
title = {Murli Assistant - DistilGPT-2 MAXIMUM (Experimental)},
|
| 261 |
+
year = {2025},
|
| 262 |
+
publisher = {HuggingFace},
|
| 263 |
+
note = {Experimental model demonstrating small model limitations},
|
| 264 |
+
url = {https://huggingface.co/eswarankrishnamurthy/murli-assistant-distilgpt2-maximum}
|
| 265 |
+
}
|
| 266 |
+
```
|
| 267 |
+
|
| 268 |
+
## 📧 Contact
|
| 269 |
+
|
| 270 |
+
For questions about this research or the production Phi-2 model, please open an issue.
|
| 271 |
+
|
| 272 |
+
---
|
| 273 |
+
|
| 274 |
+
## ⚠️ DISCLAIMER
|
| 275 |
+
|
| 276 |
+
**This model is provided for research and educational purposes only.**
|
| 277 |
+
|
| 278 |
+
- Not suitable for spiritual guidance
|
| 279 |
+
- May produce incorrect or misleading information
|
| 280 |
+
- Responses should be verified against authentic murli sources
|
| 281 |
+
- Use at your own discretion
|
| 282 |
+
|
| 283 |
+
**For reliable murli assistance, consult:**
|
| 284 |
+
- Official Brahma Kumaris publications
|
| 285 |
+
- Experienced BK teachers
|
| 286 |
+
- The production Phi-2 based murli assistant (when available)
|
| 287 |
+
|
| 288 |
+
---
|
| 289 |
+
|
| 290 |
+
**Om Shanti! 🙏**
|
| 291 |
+
|
| 292 |
+
*Maximum training doesn't overcome fundamental capacity limits.*
|
| 293 |
+
*Sometimes you just need a bigger model.*
|
| 294 |
+
|
| 295 |
+
---
|
| 296 |
+
|
| 297 |
+
**Model Type:** Experimental Research Model
|
| 298 |
+
**Quality Rating:** ⭐ (Insufficient for production)
|
| 299 |
+
**Speed Rating:** ⭐⭐⭐⭐⭐ (Excellent)
|
| 300 |
+
**Recommended Alternative:** Phi-2 (⭐⭐⭐⭐⭐ quality)
|
adapter_config.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alpha_pattern": {},
|
| 3 |
+
"auto_mapping": null,
|
| 4 |
+
"base_model_name_or_path": "distilgpt2",
|
| 5 |
+
"bias": "none",
|
| 6 |
+
"corda_config": null,
|
| 7 |
+
"eva_config": null,
|
| 8 |
+
"exclude_modules": null,
|
| 9 |
+
"fan_in_fan_out": true,
|
| 10 |
+
"inference_mode": true,
|
| 11 |
+
"init_lora_weights": true,
|
| 12 |
+
"layer_replication": null,
|
| 13 |
+
"layers_pattern": null,
|
| 14 |
+
"layers_to_transform": null,
|
| 15 |
+
"loftq_config": {},
|
| 16 |
+
"lora_alpha": 64,
|
| 17 |
+
"lora_bias": false,
|
| 18 |
+
"lora_dropout": 0.05,
|
| 19 |
+
"megatron_config": null,
|
| 20 |
+
"megatron_core": "megatron.core",
|
| 21 |
+
"modules_to_save": null,
|
| 22 |
+
"peft_type": "LORA",
|
| 23 |
+
"qalora_group_size": 16,
|
| 24 |
+
"r": 32,
|
| 25 |
+
"rank_pattern": {},
|
| 26 |
+
"revision": null,
|
| 27 |
+
"target_modules": [
|
| 28 |
+
"c_fc",
|
| 29 |
+
"c_attn",
|
| 30 |
+
"c_proj"
|
| 31 |
+
],
|
| 32 |
+
"target_parameters": null,
|
| 33 |
+
"task_type": "CAUSAL_LM",
|
| 34 |
+
"trainable_token_indices": null,
|
| 35 |
+
"use_dora": false,
|
| 36 |
+
"use_qalora": false,
|
| 37 |
+
"use_rslora": false
|
| 38 |
+
}
|
adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:288109ee817e5803863f9be6ca69a36eaf792db46082195c13ddf4bd8aadf6f6
|
| 3 |
+
size 9443272
|
checkpoint-286/README.md
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: distilgpt2
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:distilgpt2
|
| 7 |
+
- lora
|
| 8 |
+
- transformers
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Model Card for Model ID
|
| 12 |
+
|
| 13 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
## Model Details
|
| 18 |
+
|
| 19 |
+
### Model Description
|
| 20 |
+
|
| 21 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
- **Developed by:** [More Information Needed]
|
| 26 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 27 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 28 |
+
- **Model type:** [More Information Needed]
|
| 29 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 30 |
+
- **License:** [More Information Needed]
|
| 31 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 32 |
+
|
| 33 |
+
### Model Sources [optional]
|
| 34 |
+
|
| 35 |
+
<!-- Provide the basic links for the model. -->
|
| 36 |
+
|
| 37 |
+
- **Repository:** [More Information Needed]
|
| 38 |
+
- **Paper [optional]:** [More Information Needed]
|
| 39 |
+
- **Demo [optional]:** [More Information Needed]
|
| 40 |
+
|
| 41 |
+
## Uses
|
| 42 |
+
|
| 43 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 44 |
+
|
| 45 |
+
### Direct Use
|
| 46 |
+
|
| 47 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 48 |
+
|
| 49 |
+
[More Information Needed]
|
| 50 |
+
|
| 51 |
+
### Downstream Use [optional]
|
| 52 |
+
|
| 53 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 54 |
+
|
| 55 |
+
[More Information Needed]
|
| 56 |
+
|
| 57 |
+
### Out-of-Scope Use
|
| 58 |
+
|
| 59 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 60 |
+
|
| 61 |
+
[More Information Needed]
|
| 62 |
+
|
| 63 |
+
## Bias, Risks, and Limitations
|
| 64 |
+
|
| 65 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 66 |
+
|
| 67 |
+
[More Information Needed]
|
| 68 |
+
|
| 69 |
+
### Recommendations
|
| 70 |
+
|
| 71 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 72 |
+
|
| 73 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 74 |
+
|
| 75 |
+
## How to Get Started with the Model
|
| 76 |
+
|
| 77 |
+
Use the code below to get started with the model.
|
| 78 |
+
|
| 79 |
+
[More Information Needed]
|
| 80 |
+
|
| 81 |
+
## Training Details
|
| 82 |
+
|
| 83 |
+
### Training Data
|
| 84 |
+
|
| 85 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 86 |
+
|
| 87 |
+
[More Information Needed]
|
| 88 |
+
|
| 89 |
+
### Training Procedure
|
| 90 |
+
|
| 91 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 92 |
+
|
| 93 |
+
#### Preprocessing [optional]
|
| 94 |
+
|
| 95 |
+
[More Information Needed]
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
#### Training Hyperparameters
|
| 99 |
+
|
| 100 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 101 |
+
|
| 102 |
+
#### Speeds, Sizes, Times [optional]
|
| 103 |
+
|
| 104 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 105 |
+
|
| 106 |
+
[More Information Needed]
|
| 107 |
+
|
| 108 |
+
## Evaluation
|
| 109 |
+
|
| 110 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 111 |
+
|
| 112 |
+
### Testing Data, Factors & Metrics
|
| 113 |
+
|
| 114 |
+
#### Testing Data
|
| 115 |
+
|
| 116 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 117 |
+
|
| 118 |
+
[More Information Needed]
|
| 119 |
+
|
| 120 |
+
#### Factors
|
| 121 |
+
|
| 122 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 123 |
+
|
| 124 |
+
[More Information Needed]
|
| 125 |
+
|
| 126 |
+
#### Metrics
|
| 127 |
+
|
| 128 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 129 |
+
|
| 130 |
+
[More Information Needed]
|
| 131 |
+
|
| 132 |
+
### Results
|
| 133 |
+
|
| 134 |
+
[More Information Needed]
|
| 135 |
+
|
| 136 |
+
#### Summary
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
## Model Examination [optional]
|
| 141 |
+
|
| 142 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 143 |
+
|
| 144 |
+
[More Information Needed]
|
| 145 |
+
|
| 146 |
+
## Environmental Impact
|
| 147 |
+
|
| 148 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 149 |
+
|
| 150 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 151 |
+
|
| 152 |
+
- **Hardware Type:** [More Information Needed]
|
| 153 |
+
- **Hours used:** [More Information Needed]
|
| 154 |
+
- **Cloud Provider:** [More Information Needed]
|
| 155 |
+
- **Compute Region:** [More Information Needed]
|
| 156 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 157 |
+
|
| 158 |
+
## Technical Specifications [optional]
|
| 159 |
+
|
| 160 |
+
### Model Architecture and Objective
|
| 161 |
+
|
| 162 |
+
[More Information Needed]
|
| 163 |
+
|
| 164 |
+
### Compute Infrastructure
|
| 165 |
+
|
| 166 |
+
[More Information Needed]
|
| 167 |
+
|
| 168 |
+
#### Hardware
|
| 169 |
+
|
| 170 |
+
[More Information Needed]
|
| 171 |
+
|
| 172 |
+
#### Software
|
| 173 |
+
|
| 174 |
+
[More Information Needed]
|
| 175 |
+
|
| 176 |
+
## Citation [optional]
|
| 177 |
+
|
| 178 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 179 |
+
|
| 180 |
+
**BibTeX:**
|
| 181 |
+
|
| 182 |
+
[More Information Needed]
|
| 183 |
+
|
| 184 |
+
**APA:**
|
| 185 |
+
|
| 186 |
+
[More Information Needed]
|
| 187 |
+
|
| 188 |
+
## Glossary [optional]
|
| 189 |
+
|
| 190 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 191 |
+
|
| 192 |
+
[More Information Needed]
|
| 193 |
+
|
| 194 |
+
## More Information [optional]
|
| 195 |
+
|
| 196 |
+
[More Information Needed]
|
| 197 |
+
|
| 198 |
+
## Model Card Authors [optional]
|
| 199 |
+
|
| 200 |
+
[More Information Needed]
|
| 201 |
+
|
| 202 |
+
## Model Card Contact
|
| 203 |
+
|
| 204 |
+
[More Information Needed]
|
| 205 |
+
### Framework versions
|
| 206 |
+
|
| 207 |
+
- PEFT 0.17.1
|
checkpoint-286/adapter_config.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alpha_pattern": {},
|
| 3 |
+
"auto_mapping": null,
|
| 4 |
+
"base_model_name_or_path": "distilgpt2",
|
| 5 |
+
"bias": "none",
|
| 6 |
+
"corda_config": null,
|
| 7 |
+
"eva_config": null,
|
| 8 |
+
"exclude_modules": null,
|
| 9 |
+
"fan_in_fan_out": true,
|
| 10 |
+
"inference_mode": true,
|
| 11 |
+
"init_lora_weights": true,
|
| 12 |
+
"layer_replication": null,
|
| 13 |
+
"layers_pattern": null,
|
| 14 |
+
"layers_to_transform": null,
|
| 15 |
+
"loftq_config": {},
|
| 16 |
+
"lora_alpha": 64,
|
| 17 |
+
"lora_bias": false,
|
| 18 |
+
"lora_dropout": 0.05,
|
| 19 |
+
"megatron_config": null,
|
| 20 |
+
"megatron_core": "megatron.core",
|
| 21 |
+
"modules_to_save": null,
|
| 22 |
+
"peft_type": "LORA",
|
| 23 |
+
"qalora_group_size": 16,
|
| 24 |
+
"r": 32,
|
| 25 |
+
"rank_pattern": {},
|
| 26 |
+
"revision": null,
|
| 27 |
+
"target_modules": [
|
| 28 |
+
"c_fc",
|
| 29 |
+
"c_attn",
|
| 30 |
+
"c_proj"
|
| 31 |
+
],
|
| 32 |
+
"target_parameters": null,
|
| 33 |
+
"task_type": "CAUSAL_LM",
|
| 34 |
+
"trainable_token_indices": null,
|
| 35 |
+
"use_dora": false,
|
| 36 |
+
"use_qalora": false,
|
| 37 |
+
"use_rslora": false
|
| 38 |
+
}
|
checkpoint-286/adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c531ce44d74b86698076ff478d08282edd05f44ce0fbadb7eada2dfef4f62a58
|
| 3 |
+
size 9443272
|
checkpoint-286/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoint-286/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:851b7f58710d6cfd18a2d88bdea82116e10b43f0c32baca6e55f45d8ea193abc
|
| 3 |
+
size 18913786
|
checkpoint-286/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a0d5c622a405ce95ad60dfe7cad6d689013776cdbea3c4b5f2be818c0c33edf9
|
| 3 |
+
size 13990
|
checkpoint-286/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6af2db14a991f98d11b470d828918ebb713eaa44aac1444e6ff40cc88aed4386
|
| 3 |
+
size 1064
|
checkpoint-286/special_tokens_map.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<|endoftext|>",
|
| 3 |
+
"eos_token": "<|endoftext|>",
|
| 4 |
+
"pad_token": "<|endoftext|>",
|
| 5 |
+
"unk_token": "<|endoftext|>"
|
| 6 |
+
}
|
checkpoint-286/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoint-286/tokenizer_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"50256": {
|
| 5 |
+
"content": "<|endoftext|>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": true,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"bos_token": "<|endoftext|>",
|
| 14 |
+
"clean_up_tokenization_spaces": false,
|
| 15 |
+
"eos_token": "<|endoftext|>",
|
| 16 |
+
"extra_special_tokens": {},
|
| 17 |
+
"model_max_length": 1024,
|
| 18 |
+
"pad_token": "<|endoftext|>",
|
| 19 |
+
"tokenizer_class": "GPT2Tokenizer",
|
| 20 |
+
"unk_token": "<|endoftext|>"
|
| 21 |
+
}
|
checkpoint-286/trainer_state.json
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 13.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 286,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.46511627906976744,
|
| 14 |
+
"grad_norm": 0.8585774898529053,
|
| 15 |
+
"learning_rate": 2.25e-06,
|
| 16 |
+
"loss": 4.6803,
|
| 17 |
+
"step": 10
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.9302325581395349,
|
| 21 |
+
"grad_norm": 0.6986469626426697,
|
| 22 |
+
"learning_rate": 4.75e-06,
|
| 23 |
+
"loss": 4.6484,
|
| 24 |
+
"step": 20
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 1.372093023255814,
|
| 28 |
+
"grad_norm": 0.9079183340072632,
|
| 29 |
+
"learning_rate": 7.25e-06,
|
| 30 |
+
"loss": 4.692,
|
| 31 |
+
"step": 30
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 1.8372093023255816,
|
| 35 |
+
"grad_norm": 0.9651162624359131,
|
| 36 |
+
"learning_rate": 9.750000000000002e-06,
|
| 37 |
+
"loss": 4.5727,
|
| 38 |
+
"step": 40
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 2.2790697674418605,
|
| 42 |
+
"grad_norm": 1.0011574029922485,
|
| 43 |
+
"learning_rate": 1.225e-05,
|
| 44 |
+
"loss": 4.5349,
|
| 45 |
+
"step": 50
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 2.744186046511628,
|
| 49 |
+
"grad_norm": 0.8613724708557129,
|
| 50 |
+
"learning_rate": 1.475e-05,
|
| 51 |
+
"loss": 4.4834,
|
| 52 |
+
"step": 60
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 3.186046511627907,
|
| 56 |
+
"grad_norm": 0.7298063039779663,
|
| 57 |
+
"learning_rate": 1.725e-05,
|
| 58 |
+
"loss": 4.3632,
|
| 59 |
+
"step": 70
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 3.6511627906976747,
|
| 63 |
+
"grad_norm": 0.9772608876228333,
|
| 64 |
+
"learning_rate": 1.9750000000000002e-05,
|
| 65 |
+
"loss": 4.2663,
|
| 66 |
+
"step": 80
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 4.093023255813954,
|
| 70 |
+
"grad_norm": 0.9942960143089294,
|
| 71 |
+
"learning_rate": 2.2250000000000002e-05,
|
| 72 |
+
"loss": 4.0867,
|
| 73 |
+
"step": 90
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 4.558139534883721,
|
| 77 |
+
"grad_norm": 1.3665835857391357,
|
| 78 |
+
"learning_rate": 2.4750000000000002e-05,
|
| 79 |
+
"loss": 3.8939,
|
| 80 |
+
"step": 100
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 5.0,
|
| 84 |
+
"grad_norm": 1.0870517492294312,
|
| 85 |
+
"learning_rate": 2.725e-05,
|
| 86 |
+
"loss": 3.6812,
|
| 87 |
+
"step": 110
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 5.465116279069767,
|
| 91 |
+
"grad_norm": 1.2463775873184204,
|
| 92 |
+
"learning_rate": 2.975e-05,
|
| 93 |
+
"loss": 3.4444,
|
| 94 |
+
"step": 120
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 5.930232558139535,
|
| 98 |
+
"grad_norm": 1.569264531135559,
|
| 99 |
+
"learning_rate": 3.2250000000000005e-05,
|
| 100 |
+
"loss": 3.1378,
|
| 101 |
+
"step": 130
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 6.372093023255814,
|
| 105 |
+
"grad_norm": 1.720213532447815,
|
| 106 |
+
"learning_rate": 3.475e-05,
|
| 107 |
+
"loss": 2.925,
|
| 108 |
+
"step": 140
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 6.837209302325581,
|
| 112 |
+
"grad_norm": 1.5594395399093628,
|
| 113 |
+
"learning_rate": 3.7250000000000004e-05,
|
| 114 |
+
"loss": 2.5943,
|
| 115 |
+
"step": 150
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 7.27906976744186,
|
| 119 |
+
"grad_norm": 1.1726150512695312,
|
| 120 |
+
"learning_rate": 3.9750000000000004e-05,
|
| 121 |
+
"loss": 2.4226,
|
| 122 |
+
"step": 160
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 7.7441860465116275,
|
| 126 |
+
"grad_norm": 1.3921430110931396,
|
| 127 |
+
"learning_rate": 4.2250000000000004e-05,
|
| 128 |
+
"loss": 2.1763,
|
| 129 |
+
"step": 170
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 8.186046511627907,
|
| 133 |
+
"grad_norm": 1.3491045236587524,
|
| 134 |
+
"learning_rate": 4.4750000000000004e-05,
|
| 135 |
+
"loss": 2.1677,
|
| 136 |
+
"step": 180
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 8.651162790697674,
|
| 140 |
+
"grad_norm": 0.9162717461585999,
|
| 141 |
+
"learning_rate": 4.7249999999999997e-05,
|
| 142 |
+
"loss": 1.9753,
|
| 143 |
+
"step": 190
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 9.093023255813954,
|
| 147 |
+
"grad_norm": 1.0425995588302612,
|
| 148 |
+
"learning_rate": 4.975e-05,
|
| 149 |
+
"loss": 2.1817,
|
| 150 |
+
"step": 200
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 9.55813953488372,
|
| 154 |
+
"grad_norm": 0.9056143164634705,
|
| 155 |
+
"learning_rate": 4.9411026970731805e-05,
|
| 156 |
+
"loss": 1.9631,
|
| 157 |
+
"step": 210
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 10.0,
|
| 161 |
+
"grad_norm": 1.2772588729858398,
|
| 162 |
+
"learning_rate": 4.7410673432392596e-05,
|
| 163 |
+
"loss": 1.8078,
|
| 164 |
+
"step": 220
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 10.465116279069768,
|
| 168 |
+
"grad_norm": 1.1431177854537964,
|
| 169 |
+
"learning_rate": 4.410789301364621e-05,
|
| 170 |
+
"loss": 1.9238,
|
| 171 |
+
"step": 230
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 10.930232558139535,
|
| 175 |
+
"grad_norm": 0.9406613707542419,
|
| 176 |
+
"learning_rate": 3.9694631307311836e-05,
|
| 177 |
+
"loss": 1.7594,
|
| 178 |
+
"step": 240
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 11.372093023255815,
|
| 182 |
+
"grad_norm": 0.8469231724739075,
|
| 183 |
+
"learning_rate": 3.442737104220801e-05,
|
| 184 |
+
"loss": 1.9515,
|
| 185 |
+
"step": 250
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 11.837209302325581,
|
| 189 |
+
"grad_norm": 1.383780598640442,
|
| 190 |
+
"learning_rate": 2.8612226239230532e-05,
|
| 191 |
+
"loss": 1.6711,
|
| 192 |
+
"step": 260
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 12.279069767441861,
|
| 196 |
+
"grad_norm": 0.8472384214401245,
|
| 197 |
+
"learning_rate": 2.2587151977137122e-05,
|
| 198 |
+
"loss": 1.7611,
|
| 199 |
+
"step": 270
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 12.744186046511627,
|
| 203 |
+
"grad_norm": 0.8573427796363831,
|
| 204 |
+
"learning_rate": 1.6702303671786797e-05,
|
| 205 |
+
"loss": 1.734,
|
| 206 |
+
"step": 280
|
| 207 |
+
}
|
| 208 |
+
],
|
| 209 |
+
"logging_steps": 10,
|
| 210 |
+
"max_steps": 330,
|
| 211 |
+
"num_input_tokens_seen": 0,
|
| 212 |
+
"num_train_epochs": 15,
|
| 213 |
+
"save_steps": 500,
|
| 214 |
+
"stateful_callbacks": {
|
| 215 |
+
"TrainerControl": {
|
| 216 |
+
"args": {
|
| 217 |
+
"should_epoch_stop": false,
|
| 218 |
+
"should_evaluate": false,
|
| 219 |
+
"should_log": false,
|
| 220 |
+
"should_save": true,
|
| 221 |
+
"should_training_stop": false
|
| 222 |
+
},
|
| 223 |
+
"attributes": {}
|
| 224 |
+
}
|
| 225 |
+
},
|
| 226 |
+
"total_flos": 616671505022976.0,
|
| 227 |
+
"train_batch_size": 2,
|
| 228 |
+
"trial_name": null,
|
| 229 |
+
"trial_params": null
|
| 230 |
+
}
|
checkpoint-286/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bae7c5af6a706a72279095930dcc3fbcc986566e61c4724b772fc668d39a80bf
|
| 3 |
+
size 5368
|
checkpoint-286/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoint-308/README.md
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: distilgpt2
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:distilgpt2
|
| 7 |
+
- lora
|
| 8 |
+
- transformers
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Model Card for Model ID
|
| 12 |
+
|
| 13 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
## Model Details
|
| 18 |
+
|
| 19 |
+
### Model Description
|
| 20 |
+
|
| 21 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
- **Developed by:** [More Information Needed]
|
| 26 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 27 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 28 |
+
- **Model type:** [More Information Needed]
|
| 29 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 30 |
+
- **License:** [More Information Needed]
|
| 31 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 32 |
+
|
| 33 |
+
### Model Sources [optional]
|
| 34 |
+
|
| 35 |
+
<!-- Provide the basic links for the model. -->
|
| 36 |
+
|
| 37 |
+
- **Repository:** [More Information Needed]
|
| 38 |
+
- **Paper [optional]:** [More Information Needed]
|
| 39 |
+
- **Demo [optional]:** [More Information Needed]
|
| 40 |
+
|
| 41 |
+
## Uses
|
| 42 |
+
|
| 43 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 44 |
+
|
| 45 |
+
### Direct Use
|
| 46 |
+
|
| 47 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 48 |
+
|
| 49 |
+
[More Information Needed]
|
| 50 |
+
|
| 51 |
+
### Downstream Use [optional]
|
| 52 |
+
|
| 53 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 54 |
+
|
| 55 |
+
[More Information Needed]
|
| 56 |
+
|
| 57 |
+
### Out-of-Scope Use
|
| 58 |
+
|
| 59 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 60 |
+
|
| 61 |
+
[More Information Needed]
|
| 62 |
+
|
| 63 |
+
## Bias, Risks, and Limitations
|
| 64 |
+
|
| 65 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 66 |
+
|
| 67 |
+
[More Information Needed]
|
| 68 |
+
|
| 69 |
+
### Recommendations
|
| 70 |
+
|
| 71 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 72 |
+
|
| 73 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 74 |
+
|
| 75 |
+
## How to Get Started with the Model
|
| 76 |
+
|
| 77 |
+
Use the code below to get started with the model.
|
| 78 |
+
|
| 79 |
+
[More Information Needed]
|
| 80 |
+
|
| 81 |
+
## Training Details
|
| 82 |
+
|
| 83 |
+
### Training Data
|
| 84 |
+
|
| 85 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 86 |
+
|
| 87 |
+
[More Information Needed]
|
| 88 |
+
|
| 89 |
+
### Training Procedure
|
| 90 |
+
|
| 91 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 92 |
+
|
| 93 |
+
#### Preprocessing [optional]
|
| 94 |
+
|
| 95 |
+
[More Information Needed]
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
#### Training Hyperparameters
|
| 99 |
+
|
| 100 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 101 |
+
|
| 102 |
+
#### Speeds, Sizes, Times [optional]
|
| 103 |
+
|
| 104 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 105 |
+
|
| 106 |
+
[More Information Needed]
|
| 107 |
+
|
| 108 |
+
## Evaluation
|
| 109 |
+
|
| 110 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 111 |
+
|
| 112 |
+
### Testing Data, Factors & Metrics
|
| 113 |
+
|
| 114 |
+
#### Testing Data
|
| 115 |
+
|
| 116 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 117 |
+
|
| 118 |
+
[More Information Needed]
|
| 119 |
+
|
| 120 |
+
#### Factors
|
| 121 |
+
|
| 122 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 123 |
+
|
| 124 |
+
[More Information Needed]
|
| 125 |
+
|
| 126 |
+
#### Metrics
|
| 127 |
+
|
| 128 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 129 |
+
|
| 130 |
+
[More Information Needed]
|
| 131 |
+
|
| 132 |
+
### Results
|
| 133 |
+
|
| 134 |
+
[More Information Needed]
|
| 135 |
+
|
| 136 |
+
#### Summary
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
## Model Examination [optional]
|
| 141 |
+
|
| 142 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 143 |
+
|
| 144 |
+
[More Information Needed]
|
| 145 |
+
|
| 146 |
+
## Environmental Impact
|
| 147 |
+
|
| 148 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 149 |
+
|
| 150 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 151 |
+
|
| 152 |
+
- **Hardware Type:** [More Information Needed]
|
| 153 |
+
- **Hours used:** [More Information Needed]
|
| 154 |
+
- **Cloud Provider:** [More Information Needed]
|
| 155 |
+
- **Compute Region:** [More Information Needed]
|
| 156 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 157 |
+
|
| 158 |
+
## Technical Specifications [optional]
|
| 159 |
+
|
| 160 |
+
### Model Architecture and Objective
|
| 161 |
+
|
| 162 |
+
[More Information Needed]
|
| 163 |
+
|
| 164 |
+
### Compute Infrastructure
|
| 165 |
+
|
| 166 |
+
[More Information Needed]
|
| 167 |
+
|
| 168 |
+
#### Hardware
|
| 169 |
+
|
| 170 |
+
[More Information Needed]
|
| 171 |
+
|
| 172 |
+
#### Software
|
| 173 |
+
|
| 174 |
+
[More Information Needed]
|
| 175 |
+
|
| 176 |
+
## Citation [optional]
|
| 177 |
+
|
| 178 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 179 |
+
|
| 180 |
+
**BibTeX:**
|
| 181 |
+
|
| 182 |
+
[More Information Needed]
|
| 183 |
+
|
| 184 |
+
**APA:**
|
| 185 |
+
|
| 186 |
+
[More Information Needed]
|
| 187 |
+
|
| 188 |
+
## Glossary [optional]
|
| 189 |
+
|
| 190 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 191 |
+
|
| 192 |
+
[More Information Needed]
|
| 193 |
+
|
| 194 |
+
## More Information [optional]
|
| 195 |
+
|
| 196 |
+
[More Information Needed]
|
| 197 |
+
|
| 198 |
+
## Model Card Authors [optional]
|
| 199 |
+
|
| 200 |
+
[More Information Needed]
|
| 201 |
+
|
| 202 |
+
## Model Card Contact
|
| 203 |
+
|
| 204 |
+
[More Information Needed]
|
| 205 |
+
### Framework versions
|
| 206 |
+
|
| 207 |
+
- PEFT 0.17.1
|
checkpoint-308/adapter_config.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alpha_pattern": {},
|
| 3 |
+
"auto_mapping": null,
|
| 4 |
+
"base_model_name_or_path": "distilgpt2",
|
| 5 |
+
"bias": "none",
|
| 6 |
+
"corda_config": null,
|
| 7 |
+
"eva_config": null,
|
| 8 |
+
"exclude_modules": null,
|
| 9 |
+
"fan_in_fan_out": true,
|
| 10 |
+
"inference_mode": true,
|
| 11 |
+
"init_lora_weights": true,
|
| 12 |
+
"layer_replication": null,
|
| 13 |
+
"layers_pattern": null,
|
| 14 |
+
"layers_to_transform": null,
|
| 15 |
+
"loftq_config": {},
|
| 16 |
+
"lora_alpha": 64,
|
| 17 |
+
"lora_bias": false,
|
| 18 |
+
"lora_dropout": 0.05,
|
| 19 |
+
"megatron_config": null,
|
| 20 |
+
"megatron_core": "megatron.core",
|
| 21 |
+
"modules_to_save": null,
|
| 22 |
+
"peft_type": "LORA",
|
| 23 |
+
"qalora_group_size": 16,
|
| 24 |
+
"r": 32,
|
| 25 |
+
"rank_pattern": {},
|
| 26 |
+
"revision": null,
|
| 27 |
+
"target_modules": [
|
| 28 |
+
"c_fc",
|
| 29 |
+
"c_attn",
|
| 30 |
+
"c_proj"
|
| 31 |
+
],
|
| 32 |
+
"target_parameters": null,
|
| 33 |
+
"task_type": "CAUSAL_LM",
|
| 34 |
+
"trainable_token_indices": null,
|
| 35 |
+
"use_dora": false,
|
| 36 |
+
"use_qalora": false,
|
| 37 |
+
"use_rslora": false
|
| 38 |
+
}
|
checkpoint-308/adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:65386ebb31d320c6087f10c748cba566eb4cbf291154d934321e3bd97273746a
|
| 3 |
+
size 9443272
|
checkpoint-308/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoint-308/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7f8f3b48ad98031df6bdaccd21dd71e1fdbaa1570b08356e70a3149e73f1a32f
|
| 3 |
+
size 18913786
|
checkpoint-308/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ee0f6bb6735bf74ef72705cd04902b5d70c7d483b64a0653e4376f76362586d1
|
| 3 |
+
size 13990
|
checkpoint-308/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cc105e75cc62635527d37b2817894b42989bad169b57160704b4bef43a1a7f8f
|
| 3 |
+
size 1064
|
checkpoint-308/special_tokens_map.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<|endoftext|>",
|
| 3 |
+
"eos_token": "<|endoftext|>",
|
| 4 |
+
"pad_token": "<|endoftext|>",
|
| 5 |
+
"unk_token": "<|endoftext|>"
|
| 6 |
+
}
|
checkpoint-308/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoint-308/tokenizer_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"50256": {
|
| 5 |
+
"content": "<|endoftext|>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": true,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"bos_token": "<|endoftext|>",
|
| 14 |
+
"clean_up_tokenization_spaces": false,
|
| 15 |
+
"eos_token": "<|endoftext|>",
|
| 16 |
+
"extra_special_tokens": {},
|
| 17 |
+
"model_max_length": 1024,
|
| 18 |
+
"pad_token": "<|endoftext|>",
|
| 19 |
+
"tokenizer_class": "GPT2Tokenizer",
|
| 20 |
+
"unk_token": "<|endoftext|>"
|
| 21 |
+
}
|
checkpoint-308/trainer_state.json
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 14.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 308,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.46511627906976744,
|
| 14 |
+
"grad_norm": 0.8585774898529053,
|
| 15 |
+
"learning_rate": 2.25e-06,
|
| 16 |
+
"loss": 4.6803,
|
| 17 |
+
"step": 10
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.9302325581395349,
|
| 21 |
+
"grad_norm": 0.6986469626426697,
|
| 22 |
+
"learning_rate": 4.75e-06,
|
| 23 |
+
"loss": 4.6484,
|
| 24 |
+
"step": 20
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 1.372093023255814,
|
| 28 |
+
"grad_norm": 0.9079183340072632,
|
| 29 |
+
"learning_rate": 7.25e-06,
|
| 30 |
+
"loss": 4.692,
|
| 31 |
+
"step": 30
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 1.8372093023255816,
|
| 35 |
+
"grad_norm": 0.9651162624359131,
|
| 36 |
+
"learning_rate": 9.750000000000002e-06,
|
| 37 |
+
"loss": 4.5727,
|
| 38 |
+
"step": 40
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 2.2790697674418605,
|
| 42 |
+
"grad_norm": 1.0011574029922485,
|
| 43 |
+
"learning_rate": 1.225e-05,
|
| 44 |
+
"loss": 4.5349,
|
| 45 |
+
"step": 50
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 2.744186046511628,
|
| 49 |
+
"grad_norm": 0.8613724708557129,
|
| 50 |
+
"learning_rate": 1.475e-05,
|
| 51 |
+
"loss": 4.4834,
|
| 52 |
+
"step": 60
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 3.186046511627907,
|
| 56 |
+
"grad_norm": 0.7298063039779663,
|
| 57 |
+
"learning_rate": 1.725e-05,
|
| 58 |
+
"loss": 4.3632,
|
| 59 |
+
"step": 70
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 3.6511627906976747,
|
| 63 |
+
"grad_norm": 0.9772608876228333,
|
| 64 |
+
"learning_rate": 1.9750000000000002e-05,
|
| 65 |
+
"loss": 4.2663,
|
| 66 |
+
"step": 80
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 4.093023255813954,
|
| 70 |
+
"grad_norm": 0.9942960143089294,
|
| 71 |
+
"learning_rate": 2.2250000000000002e-05,
|
| 72 |
+
"loss": 4.0867,
|
| 73 |
+
"step": 90
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 4.558139534883721,
|
| 77 |
+
"grad_norm": 1.3665835857391357,
|
| 78 |
+
"learning_rate": 2.4750000000000002e-05,
|
| 79 |
+
"loss": 3.8939,
|
| 80 |
+
"step": 100
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 5.0,
|
| 84 |
+
"grad_norm": 1.0870517492294312,
|
| 85 |
+
"learning_rate": 2.725e-05,
|
| 86 |
+
"loss": 3.6812,
|
| 87 |
+
"step": 110
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 5.465116279069767,
|
| 91 |
+
"grad_norm": 1.2463775873184204,
|
| 92 |
+
"learning_rate": 2.975e-05,
|
| 93 |
+
"loss": 3.4444,
|
| 94 |
+
"step": 120
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 5.930232558139535,
|
| 98 |
+
"grad_norm": 1.569264531135559,
|
| 99 |
+
"learning_rate": 3.2250000000000005e-05,
|
| 100 |
+
"loss": 3.1378,
|
| 101 |
+
"step": 130
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 6.372093023255814,
|
| 105 |
+
"grad_norm": 1.720213532447815,
|
| 106 |
+
"learning_rate": 3.475e-05,
|
| 107 |
+
"loss": 2.925,
|
| 108 |
+
"step": 140
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 6.837209302325581,
|
| 112 |
+
"grad_norm": 1.5594395399093628,
|
| 113 |
+
"learning_rate": 3.7250000000000004e-05,
|
| 114 |
+
"loss": 2.5943,
|
| 115 |
+
"step": 150
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 7.27906976744186,
|
| 119 |
+
"grad_norm": 1.1726150512695312,
|
| 120 |
+
"learning_rate": 3.9750000000000004e-05,
|
| 121 |
+
"loss": 2.4226,
|
| 122 |
+
"step": 160
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 7.7441860465116275,
|
| 126 |
+
"grad_norm": 1.3921430110931396,
|
| 127 |
+
"learning_rate": 4.2250000000000004e-05,
|
| 128 |
+
"loss": 2.1763,
|
| 129 |
+
"step": 170
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 8.186046511627907,
|
| 133 |
+
"grad_norm": 1.3491045236587524,
|
| 134 |
+
"learning_rate": 4.4750000000000004e-05,
|
| 135 |
+
"loss": 2.1677,
|
| 136 |
+
"step": 180
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 8.651162790697674,
|
| 140 |
+
"grad_norm": 0.9162717461585999,
|
| 141 |
+
"learning_rate": 4.7249999999999997e-05,
|
| 142 |
+
"loss": 1.9753,
|
| 143 |
+
"step": 190
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 9.093023255813954,
|
| 147 |
+
"grad_norm": 1.0425995588302612,
|
| 148 |
+
"learning_rate": 4.975e-05,
|
| 149 |
+
"loss": 2.1817,
|
| 150 |
+
"step": 200
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 9.55813953488372,
|
| 154 |
+
"grad_norm": 0.9056143164634705,
|
| 155 |
+
"learning_rate": 4.9411026970731805e-05,
|
| 156 |
+
"loss": 1.9631,
|
| 157 |
+
"step": 210
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 10.0,
|
| 161 |
+
"grad_norm": 1.2772588729858398,
|
| 162 |
+
"learning_rate": 4.7410673432392596e-05,
|
| 163 |
+
"loss": 1.8078,
|
| 164 |
+
"step": 220
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 10.465116279069768,
|
| 168 |
+
"grad_norm": 1.1431177854537964,
|
| 169 |
+
"learning_rate": 4.410789301364621e-05,
|
| 170 |
+
"loss": 1.9238,
|
| 171 |
+
"step": 230
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 10.930232558139535,
|
| 175 |
+
"grad_norm": 0.9406613707542419,
|
| 176 |
+
"learning_rate": 3.9694631307311836e-05,
|
| 177 |
+
"loss": 1.7594,
|
| 178 |
+
"step": 240
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 11.372093023255815,
|
| 182 |
+
"grad_norm": 0.8469231724739075,
|
| 183 |
+
"learning_rate": 3.442737104220801e-05,
|
| 184 |
+
"loss": 1.9515,
|
| 185 |
+
"step": 250
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 11.837209302325581,
|
| 189 |
+
"grad_norm": 1.383780598640442,
|
| 190 |
+
"learning_rate": 2.8612226239230532e-05,
|
| 191 |
+
"loss": 1.6711,
|
| 192 |
+
"step": 260
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 12.279069767441861,
|
| 196 |
+
"grad_norm": 0.8472384214401245,
|
| 197 |
+
"learning_rate": 2.2587151977137122e-05,
|
| 198 |
+
"loss": 1.7611,
|
| 199 |
+
"step": 270
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 12.744186046511627,
|
| 203 |
+
"grad_norm": 0.8573427796363831,
|
| 204 |
+
"learning_rate": 1.6702303671786797e-05,
|
| 205 |
+
"loss": 1.734,
|
| 206 |
+
"step": 280
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 13.186046511627907,
|
| 210 |
+
"grad_norm": 0.9242410659790039,
|
| 211 |
+
"learning_rate": 1.1299687316133256e-05,
|
| 212 |
+
"loss": 1.7234,
|
| 213 |
+
"step": 290
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 13.651162790697674,
|
| 217 |
+
"grad_norm": 0.9648193717002869,
|
| 218 |
+
"learning_rate": 6.69328333505567e-06,
|
| 219 |
+
"loss": 1.7062,
|
| 220 |
+
"step": 300
|
| 221 |
+
}
|
| 222 |
+
],
|
| 223 |
+
"logging_steps": 10,
|
| 224 |
+
"max_steps": 330,
|
| 225 |
+
"num_input_tokens_seen": 0,
|
| 226 |
+
"num_train_epochs": 15,
|
| 227 |
+
"save_steps": 500,
|
| 228 |
+
"stateful_callbacks": {
|
| 229 |
+
"TrainerControl": {
|
| 230 |
+
"args": {
|
| 231 |
+
"should_epoch_stop": false,
|
| 232 |
+
"should_evaluate": false,
|
| 233 |
+
"should_log": false,
|
| 234 |
+
"should_save": true,
|
| 235 |
+
"should_training_stop": false
|
| 236 |
+
},
|
| 237 |
+
"attributes": {}
|
| 238 |
+
}
|
| 239 |
+
},
|
| 240 |
+
"total_flos": 664107774640128.0,
|
| 241 |
+
"train_batch_size": 2,
|
| 242 |
+
"trial_name": null,
|
| 243 |
+
"trial_params": null
|
| 244 |
+
}
|
checkpoint-308/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bae7c5af6a706a72279095930dcc3fbcc986566e61c4724b772fc668d39a80bf
|
| 3 |
+
size 5368
|
checkpoint-308/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoint-330/README.md
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: distilgpt2
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:distilgpt2
|
| 7 |
+
- lora
|
| 8 |
+
- transformers
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Model Card for Model ID
|
| 12 |
+
|
| 13 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
## Model Details
|
| 18 |
+
|
| 19 |
+
### Model Description
|
| 20 |
+
|
| 21 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
- **Developed by:** [More Information Needed]
|
| 26 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 27 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 28 |
+
- **Model type:** [More Information Needed]
|
| 29 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 30 |
+
- **License:** [More Information Needed]
|
| 31 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 32 |
+
|
| 33 |
+
### Model Sources [optional]
|
| 34 |
+
|
| 35 |
+
<!-- Provide the basic links for the model. -->
|
| 36 |
+
|
| 37 |
+
- **Repository:** [More Information Needed]
|
| 38 |
+
- **Paper [optional]:** [More Information Needed]
|
| 39 |
+
- **Demo [optional]:** [More Information Needed]
|
| 40 |
+
|
| 41 |
+
## Uses
|
| 42 |
+
|
| 43 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 44 |
+
|
| 45 |
+
### Direct Use
|
| 46 |
+
|
| 47 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 48 |
+
|
| 49 |
+
[More Information Needed]
|
| 50 |
+
|
| 51 |
+
### Downstream Use [optional]
|
| 52 |
+
|
| 53 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 54 |
+
|
| 55 |
+
[More Information Needed]
|
| 56 |
+
|
| 57 |
+
### Out-of-Scope Use
|
| 58 |
+
|
| 59 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 60 |
+
|
| 61 |
+
[More Information Needed]
|
| 62 |
+
|
| 63 |
+
## Bias, Risks, and Limitations
|
| 64 |
+
|
| 65 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 66 |
+
|
| 67 |
+
[More Information Needed]
|
| 68 |
+
|
| 69 |
+
### Recommendations
|
| 70 |
+
|
| 71 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 72 |
+
|
| 73 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 74 |
+
|
| 75 |
+
## How to Get Started with the Model
|
| 76 |
+
|
| 77 |
+
Use the code below to get started with the model.
|
| 78 |
+
|
| 79 |
+
[More Information Needed]
|
| 80 |
+
|
| 81 |
+
## Training Details
|
| 82 |
+
|
| 83 |
+
### Training Data
|
| 84 |
+
|
| 85 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 86 |
+
|
| 87 |
+
[More Information Needed]
|
| 88 |
+
|
| 89 |
+
### Training Procedure
|
| 90 |
+
|
| 91 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 92 |
+
|
| 93 |
+
#### Preprocessing [optional]
|
| 94 |
+
|
| 95 |
+
[More Information Needed]
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
#### Training Hyperparameters
|
| 99 |
+
|
| 100 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 101 |
+
|
| 102 |
+
#### Speeds, Sizes, Times [optional]
|
| 103 |
+
|
| 104 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 105 |
+
|
| 106 |
+
[More Information Needed]
|
| 107 |
+
|
| 108 |
+
## Evaluation
|
| 109 |
+
|
| 110 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 111 |
+
|
| 112 |
+
### Testing Data, Factors & Metrics
|
| 113 |
+
|
| 114 |
+
#### Testing Data
|
| 115 |
+
|
| 116 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 117 |
+
|
| 118 |
+
[More Information Needed]
|
| 119 |
+
|
| 120 |
+
#### Factors
|
| 121 |
+
|
| 122 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 123 |
+
|
| 124 |
+
[More Information Needed]
|
| 125 |
+
|
| 126 |
+
#### Metrics
|
| 127 |
+
|
| 128 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 129 |
+
|
| 130 |
+
[More Information Needed]
|
| 131 |
+
|
| 132 |
+
### Results
|
| 133 |
+
|
| 134 |
+
[More Information Needed]
|
| 135 |
+
|
| 136 |
+
#### Summary
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
## Model Examination [optional]
|
| 141 |
+
|
| 142 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 143 |
+
|
| 144 |
+
[More Information Needed]
|
| 145 |
+
|
| 146 |
+
## Environmental Impact
|
| 147 |
+
|
| 148 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 149 |
+
|
| 150 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 151 |
+
|
| 152 |
+
- **Hardware Type:** [More Information Needed]
|
| 153 |
+
- **Hours used:** [More Information Needed]
|
| 154 |
+
- **Cloud Provider:** [More Information Needed]
|
| 155 |
+
- **Compute Region:** [More Information Needed]
|
| 156 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 157 |
+
|
| 158 |
+
## Technical Specifications [optional]
|
| 159 |
+
|
| 160 |
+
### Model Architecture and Objective
|
| 161 |
+
|
| 162 |
+
[More Information Needed]
|
| 163 |
+
|
| 164 |
+
### Compute Infrastructure
|
| 165 |
+
|
| 166 |
+
[More Information Needed]
|
| 167 |
+
|
| 168 |
+
#### Hardware
|
| 169 |
+
|
| 170 |
+
[More Information Needed]
|
| 171 |
+
|
| 172 |
+
#### Software
|
| 173 |
+
|
| 174 |
+
[More Information Needed]
|
| 175 |
+
|
| 176 |
+
## Citation [optional]
|
| 177 |
+
|
| 178 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 179 |
+
|
| 180 |
+
**BibTeX:**
|
| 181 |
+
|
| 182 |
+
[More Information Needed]
|
| 183 |
+
|
| 184 |
+
**APA:**
|
| 185 |
+
|
| 186 |
+
[More Information Needed]
|
| 187 |
+
|
| 188 |
+
## Glossary [optional]
|
| 189 |
+
|
| 190 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 191 |
+
|
| 192 |
+
[More Information Needed]
|
| 193 |
+
|
| 194 |
+
## More Information [optional]
|
| 195 |
+
|
| 196 |
+
[More Information Needed]
|
| 197 |
+
|
| 198 |
+
## Model Card Authors [optional]
|
| 199 |
+
|
| 200 |
+
[More Information Needed]
|
| 201 |
+
|
| 202 |
+
## Model Card Contact
|
| 203 |
+
|
| 204 |
+
[More Information Needed]
|
| 205 |
+
### Framework versions
|
| 206 |
+
|
| 207 |
+
- PEFT 0.17.1
|
checkpoint-330/adapter_config.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alpha_pattern": {},
|
| 3 |
+
"auto_mapping": null,
|
| 4 |
+
"base_model_name_or_path": "distilgpt2",
|
| 5 |
+
"bias": "none",
|
| 6 |
+
"corda_config": null,
|
| 7 |
+
"eva_config": null,
|
| 8 |
+
"exclude_modules": null,
|
| 9 |
+
"fan_in_fan_out": true,
|
| 10 |
+
"inference_mode": true,
|
| 11 |
+
"init_lora_weights": true,
|
| 12 |
+
"layer_replication": null,
|
| 13 |
+
"layers_pattern": null,
|
| 14 |
+
"layers_to_transform": null,
|
| 15 |
+
"loftq_config": {},
|
| 16 |
+
"lora_alpha": 64,
|
| 17 |
+
"lora_bias": false,
|
| 18 |
+
"lora_dropout": 0.05,
|
| 19 |
+
"megatron_config": null,
|
| 20 |
+
"megatron_core": "megatron.core",
|
| 21 |
+
"modules_to_save": null,
|
| 22 |
+
"peft_type": "LORA",
|
| 23 |
+
"qalora_group_size": 16,
|
| 24 |
+
"r": 32,
|
| 25 |
+
"rank_pattern": {},
|
| 26 |
+
"revision": null,
|
| 27 |
+
"target_modules": [
|
| 28 |
+
"c_fc",
|
| 29 |
+
"c_attn",
|
| 30 |
+
"c_proj"
|
| 31 |
+
],
|
| 32 |
+
"target_parameters": null,
|
| 33 |
+
"task_type": "CAUSAL_LM",
|
| 34 |
+
"trainable_token_indices": null,
|
| 35 |
+
"use_dora": false,
|
| 36 |
+
"use_qalora": false,
|
| 37 |
+
"use_rslora": false
|
| 38 |
+
}
|
checkpoint-330/adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:288109ee817e5803863f9be6ca69a36eaf792db46082195c13ddf4bd8aadf6f6
|
| 3 |
+
size 9443272
|
checkpoint-330/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoint-330/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8a32458c8ef26d40b6d4839d3e26dad935b05947a52a533aaba0209276f2b4be
|
| 3 |
+
size 18913786
|
checkpoint-330/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:29780a39118ee3613ab342725dd956bb6fcd9571e3d8036e70fea20442a7ecd2
|
| 3 |
+
size 13990
|
checkpoint-330/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b22894a29a9b34fbc993eddcbeb56a994c2ac34ea81f73f447368a772a6136ff
|
| 3 |
+
size 1064
|
checkpoint-330/special_tokens_map.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<|endoftext|>",
|
| 3 |
+
"eos_token": "<|endoftext|>",
|
| 4 |
+
"pad_token": "<|endoftext|>",
|
| 5 |
+
"unk_token": "<|endoftext|>"
|
| 6 |
+
}
|
checkpoint-330/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoint-330/tokenizer_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"50256": {
|
| 5 |
+
"content": "<|endoftext|>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": true,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"bos_token": "<|endoftext|>",
|
| 14 |
+
"clean_up_tokenization_spaces": false,
|
| 15 |
+
"eos_token": "<|endoftext|>",
|
| 16 |
+
"extra_special_tokens": {},
|
| 17 |
+
"model_max_length": 1024,
|
| 18 |
+
"pad_token": "<|endoftext|>",
|
| 19 |
+
"tokenizer_class": "GPT2Tokenizer",
|
| 20 |
+
"unk_token": "<|endoftext|>"
|
| 21 |
+
}
|
checkpoint-330/trainer_state.json
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 15.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 330,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.46511627906976744,
|
| 14 |
+
"grad_norm": 0.8585774898529053,
|
| 15 |
+
"learning_rate": 2.25e-06,
|
| 16 |
+
"loss": 4.6803,
|
| 17 |
+
"step": 10
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.9302325581395349,
|
| 21 |
+
"grad_norm": 0.6986469626426697,
|
| 22 |
+
"learning_rate": 4.75e-06,
|
| 23 |
+
"loss": 4.6484,
|
| 24 |
+
"step": 20
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 1.372093023255814,
|
| 28 |
+
"grad_norm": 0.9079183340072632,
|
| 29 |
+
"learning_rate": 7.25e-06,
|
| 30 |
+
"loss": 4.692,
|
| 31 |
+
"step": 30
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 1.8372093023255816,
|
| 35 |
+
"grad_norm": 0.9651162624359131,
|
| 36 |
+
"learning_rate": 9.750000000000002e-06,
|
| 37 |
+
"loss": 4.5727,
|
| 38 |
+
"step": 40
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 2.2790697674418605,
|
| 42 |
+
"grad_norm": 1.0011574029922485,
|
| 43 |
+
"learning_rate": 1.225e-05,
|
| 44 |
+
"loss": 4.5349,
|
| 45 |
+
"step": 50
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 2.744186046511628,
|
| 49 |
+
"grad_norm": 0.8613724708557129,
|
| 50 |
+
"learning_rate": 1.475e-05,
|
| 51 |
+
"loss": 4.4834,
|
| 52 |
+
"step": 60
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 3.186046511627907,
|
| 56 |
+
"grad_norm": 0.7298063039779663,
|
| 57 |
+
"learning_rate": 1.725e-05,
|
| 58 |
+
"loss": 4.3632,
|
| 59 |
+
"step": 70
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 3.6511627906976747,
|
| 63 |
+
"grad_norm": 0.9772608876228333,
|
| 64 |
+
"learning_rate": 1.9750000000000002e-05,
|
| 65 |
+
"loss": 4.2663,
|
| 66 |
+
"step": 80
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 4.093023255813954,
|
| 70 |
+
"grad_norm": 0.9942960143089294,
|
| 71 |
+
"learning_rate": 2.2250000000000002e-05,
|
| 72 |
+
"loss": 4.0867,
|
| 73 |
+
"step": 90
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 4.558139534883721,
|
| 77 |
+
"grad_norm": 1.3665835857391357,
|
| 78 |
+
"learning_rate": 2.4750000000000002e-05,
|
| 79 |
+
"loss": 3.8939,
|
| 80 |
+
"step": 100
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 5.0,
|
| 84 |
+
"grad_norm": 1.0870517492294312,
|
| 85 |
+
"learning_rate": 2.725e-05,
|
| 86 |
+
"loss": 3.6812,
|
| 87 |
+
"step": 110
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 5.465116279069767,
|
| 91 |
+
"grad_norm": 1.2463775873184204,
|
| 92 |
+
"learning_rate": 2.975e-05,
|
| 93 |
+
"loss": 3.4444,
|
| 94 |
+
"step": 120
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 5.930232558139535,
|
| 98 |
+
"grad_norm": 1.569264531135559,
|
| 99 |
+
"learning_rate": 3.2250000000000005e-05,
|
| 100 |
+
"loss": 3.1378,
|
| 101 |
+
"step": 130
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 6.372093023255814,
|
| 105 |
+
"grad_norm": 1.720213532447815,
|
| 106 |
+
"learning_rate": 3.475e-05,
|
| 107 |
+
"loss": 2.925,
|
| 108 |
+
"step": 140
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 6.837209302325581,
|
| 112 |
+
"grad_norm": 1.5594395399093628,
|
| 113 |
+
"learning_rate": 3.7250000000000004e-05,
|
| 114 |
+
"loss": 2.5943,
|
| 115 |
+
"step": 150
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 7.27906976744186,
|
| 119 |
+
"grad_norm": 1.1726150512695312,
|
| 120 |
+
"learning_rate": 3.9750000000000004e-05,
|
| 121 |
+
"loss": 2.4226,
|
| 122 |
+
"step": 160
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 7.7441860465116275,
|
| 126 |
+
"grad_norm": 1.3921430110931396,
|
| 127 |
+
"learning_rate": 4.2250000000000004e-05,
|
| 128 |
+
"loss": 2.1763,
|
| 129 |
+
"step": 170
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 8.186046511627907,
|
| 133 |
+
"grad_norm": 1.3491045236587524,
|
| 134 |
+
"learning_rate": 4.4750000000000004e-05,
|
| 135 |
+
"loss": 2.1677,
|
| 136 |
+
"step": 180
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 8.651162790697674,
|
| 140 |
+
"grad_norm": 0.9162717461585999,
|
| 141 |
+
"learning_rate": 4.7249999999999997e-05,
|
| 142 |
+
"loss": 1.9753,
|
| 143 |
+
"step": 190
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 9.093023255813954,
|
| 147 |
+
"grad_norm": 1.0425995588302612,
|
| 148 |
+
"learning_rate": 4.975e-05,
|
| 149 |
+
"loss": 2.1817,
|
| 150 |
+
"step": 200
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 9.55813953488372,
|
| 154 |
+
"grad_norm": 0.9056143164634705,
|
| 155 |
+
"learning_rate": 4.9411026970731805e-05,
|
| 156 |
+
"loss": 1.9631,
|
| 157 |
+
"step": 210
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 10.0,
|
| 161 |
+
"grad_norm": 1.2772588729858398,
|
| 162 |
+
"learning_rate": 4.7410673432392596e-05,
|
| 163 |
+
"loss": 1.8078,
|
| 164 |
+
"step": 220
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 10.465116279069768,
|
| 168 |
+
"grad_norm": 1.1431177854537964,
|
| 169 |
+
"learning_rate": 4.410789301364621e-05,
|
| 170 |
+
"loss": 1.9238,
|
| 171 |
+
"step": 230
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 10.930232558139535,
|
| 175 |
+
"grad_norm": 0.9406613707542419,
|
| 176 |
+
"learning_rate": 3.9694631307311836e-05,
|
| 177 |
+
"loss": 1.7594,
|
| 178 |
+
"step": 240
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 11.372093023255815,
|
| 182 |
+
"grad_norm": 0.8469231724739075,
|
| 183 |
+
"learning_rate": 3.442737104220801e-05,
|
| 184 |
+
"loss": 1.9515,
|
| 185 |
+
"step": 250
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 11.837209302325581,
|
| 189 |
+
"grad_norm": 1.383780598640442,
|
| 190 |
+
"learning_rate": 2.8612226239230532e-05,
|
| 191 |
+
"loss": 1.6711,
|
| 192 |
+
"step": 260
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 12.279069767441861,
|
| 196 |
+
"grad_norm": 0.8472384214401245,
|
| 197 |
+
"learning_rate": 2.2587151977137122e-05,
|
| 198 |
+
"loss": 1.7611,
|
| 199 |
+
"step": 270
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 12.744186046511627,
|
| 203 |
+
"grad_norm": 0.8573427796363831,
|
| 204 |
+
"learning_rate": 1.6702303671786797e-05,
|
| 205 |
+
"loss": 1.734,
|
| 206 |
+
"step": 280
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 13.186046511627907,
|
| 210 |
+
"grad_norm": 0.9242410659790039,
|
| 211 |
+
"learning_rate": 1.1299687316133256e-05,
|
| 212 |
+
"loss": 1.7234,
|
| 213 |
+
"step": 290
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 13.651162790697674,
|
| 217 |
+
"grad_norm": 0.9648193717002869,
|
| 218 |
+
"learning_rate": 6.69328333505567e-06,
|
| 219 |
+
"loss": 1.7062,
|
| 220 |
+
"step": 300
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 14.093023255813954,
|
| 224 |
+
"grad_norm": 0.9734258651733398,
|
| 225 |
+
"learning_rate": 3.1507991843370526e-06,
|
| 226 |
+
"loss": 1.7115,
|
| 227 |
+
"step": 310
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 14.55813953488372,
|
| 231 |
+
"grad_norm": 0.8706108331680298,
|
| 232 |
+
"learning_rate": 8.781111664112162e-07,
|
| 233 |
+
"loss": 1.6092,
|
| 234 |
+
"step": 320
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 15.0,
|
| 238 |
+
"grad_norm": 1.459479808807373,
|
| 239 |
+
"learning_rate": 7.299647995176462e-09,
|
| 240 |
+
"loss": 1.7667,
|
| 241 |
+
"step": 330
|
| 242 |
+
}
|
| 243 |
+
],
|
| 244 |
+
"logging_steps": 10,
|
| 245 |
+
"max_steps": 330,
|
| 246 |
+
"num_input_tokens_seen": 0,
|
| 247 |
+
"num_train_epochs": 15,
|
| 248 |
+
"save_steps": 500,
|
| 249 |
+
"stateful_callbacks": {
|
| 250 |
+
"TrainerControl": {
|
| 251 |
+
"args": {
|
| 252 |
+
"should_epoch_stop": false,
|
| 253 |
+
"should_evaluate": false,
|
| 254 |
+
"should_log": false,
|
| 255 |
+
"should_save": true,
|
| 256 |
+
"should_training_stop": true
|
| 257 |
+
},
|
| 258 |
+
"attributes": {}
|
| 259 |
+
}
|
| 260 |
+
},
|
| 261 |
+
"total_flos": 711544044257280.0,
|
| 262 |
+
"train_batch_size": 2,
|
| 263 |
+
"trial_name": null,
|
| 264 |
+
"trial_params": null
|
| 265 |
+
}
|
checkpoint-330/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bae7c5af6a706a72279095930dcc3fbcc986566e61c4724b772fc668d39a80bf
|
| 3 |
+
size 5368
|
checkpoint-330/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<|endoftext|>",
|
| 3 |
+
"eos_token": "<|endoftext|>",
|
| 4 |
+
"pad_token": "<|endoftext|>",
|
| 5 |
+
"unk_token": "<|endoftext|>"
|
| 6 |
+
}
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"50256": {
|
| 5 |
+
"content": "<|endoftext|>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": true,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"bos_token": "<|endoftext|>",
|
| 14 |
+
"clean_up_tokenization_spaces": false,
|
| 15 |
+
"eos_token": "<|endoftext|>",
|
| 16 |
+
"extra_special_tokens": {},
|
| 17 |
+
"model_max_length": 1024,
|
| 18 |
+
"pad_token": "<|endoftext|>",
|
| 19 |
+
"tokenizer_class": "GPT2Tokenizer",
|
| 20 |
+
"unk_token": "<|endoftext|>"
|
| 21 |
+
}
|
training_info.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "distilgpt2",
|
| 3 |
+
"version": "maximum-accuracy",
|
| 4 |
+
"lora_r": 32,
|
| 5 |
+
"lora_alpha": 64,
|
| 6 |
+
"target_modules": [
|
| 7 |
+
"c_attn",
|
| 8 |
+
"c_proj",
|
| 9 |
+
"c_fc"
|
| 10 |
+
],
|
| 11 |
+
"lora_dropout": 0.05,
|
| 12 |
+
"murlis_used": 500,
|
| 13 |
+
"total_examples": 344,
|
| 14 |
+
"epochs": 15,
|
| 15 |
+
"max_length": 512,
|
| 16 |
+
"batch_size": 2,
|
| 17 |
+
"gradient_accumulation": 8,
|
| 18 |
+
"effective_batch_size": 16,
|
| 19 |
+
"learning_rate": 5e-05,
|
| 20 |
+
"warmup_steps": 200,
|
| 21 |
+
"scheduler": "cosine",
|
| 22 |
+
"weight_decay": 0.02,
|
| 23 |
+
"completed_at": "2025-10-03T12:25:52.051354",
|
| 24 |
+
"improvements": [
|
| 25 |
+
"LoRA Rank: 32 (8x from standard, 2x from enhanced)",
|
| 26 |
+
"LoRA Alpha: 64 (8x from standard, 2x from enhanced)",
|
| 27 |
+
"Target Modules: c_attn + c_proj + c_fc (ALL layers)",
|
| 28 |
+
"Epochs: 15 (5x from standard, 1.5x from enhanced)",
|
| 29 |
+
"Murlis: 500 (3.3x from standard, 1.67x from enhanced)",
|
| 30 |
+
"Context: 512 tokens (2x from standard, 1.33x from enhanced)",
|
| 31 |
+
"15 detailed spiritual concepts with full explanations",
|
| 32 |
+
"7 different formats per murli for comprehensive learning",
|
| 33 |
+
"Ultra-careful learning rate (5e-5)",
|
| 34 |
+
"Maximum warmup (200 steps)",
|
| 35 |
+
"Larger effective batch (16)",
|
| 36 |
+
"Stronger regularization (0.02 weight decay)"
|
| 37 |
+
]
|
| 38 |
+
}
|
vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|