updated for safetensors
Browse files
README.md
CHANGED
|
@@ -3,13 +3,17 @@ license: apache-2.0
|
|
| 3 |
tags:
|
| 4 |
- text-generation
|
| 5 |
- language-model
|
| 6 |
-
-
|
| 7 |
-
-
|
| 8 |
-
-
|
| 9 |
- transformer
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
language: en
|
| 11 |
datasets:
|
| 12 |
-
- CosmicSet-
|
| 13 |
- akkiisfrommars/TreeCorpusCleanedmodel
|
| 14 |
model_type: CosmicFish
|
| 15 |
pipeline_tag: text-generation
|
|
@@ -28,7 +32,7 @@ A 90M parameter language model with modern architecture improvements developed by Mistyoz AI
|
|
| 28 |
wget https://huggingface.co/MistyozAI/CosmicFish-90M/resolve/main/chat.py
|
| 29 |
|
| 30 |
# Install dependencies
|
| 31 |
-
pip install transformers huggingface-hub termcolor
|
| 32 |
|
| 33 |
# Run the chat interface (automatically downloads model)
|
| 34 |
python chat.py
|
|
@@ -42,16 +46,17 @@ The `chat.py` script handles all model loading, generation, and provides the best experience
|
|
| 42 |
- **Architecture**: CosmicFish (RoPE, GQA, SwiGLU, RMSNorm)
|
| 43 |
- **Context Length**: 512 tokens
|
| 44 |
- **Vocabulary**: 50,257 tokens
|
| 45 |
-
- **Training Data**: CosmicSet
|
| 46 |
- **Developer**: Mistyoz AI
|
| 47 |
- **Repository**: MistyozAI/CosmicFish-90M
|
|
|
|
| 48 |
|
| 49 |
## Usage
|
| 50 |
|
| 51 |
### Installation
|
| 52 |
|
| 53 |
```bash
|
| 54 |
-
pip install transformers huggingface-hub termcolor
|
| 55 |
```
|
| 56 |
|
| 57 |
### Quick Chat Interface
|
|
@@ -59,6 +64,7 @@ pip install transformers huggingface-hub termcolor
|
|
| 59 |
```python
|
| 60 |
from transformers import GPT2Tokenizer
|
| 61 |
from huggingface_hub import snapshot_download
|
|
|
|
| 62 |
import torch
|
| 63 |
import json
|
| 64 |
import os
|
|
@@ -73,8 +79,8 @@ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
|
|
| 73 |
with open(os.path.join(cache_dir, "config.json")) as f:
|
| 74 |
config_dict = json.load(f)
|
| 75 |
|
| 76 |
-
# Load model weights
|
| 77 |
-
state_dict =
|
| 78 |
|
| 79 |
# Note: Full model class available in the repository
|
| 80 |
print("Model downloaded and ready for use!")
|
|
@@ -83,7 +89,7 @@ print("Model downloaded and ready for use!")
|
|
| 83 |
### Advanced Generation with Repetition Penalty
|
| 84 |
|
| 85 |
```python
|
| 86 |
-
def generate_with_repetition_penalty(model, tokenizer, prompt, max_tokens=100, temperature=0.
|
| 87 |
input_ids = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
|
| 88 |
generated = input_ids.clone()
|
| 89 |
|
|
@@ -112,6 +118,49 @@ def generate_with_repetition_penalty(model, tokenizer, prompt, max_tokens=100, t
|
|
| 112 |
return tokenizer.decode(generated[0], skip_special_tokens=True)
|
| 113 |
```
|
| 114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
### Chat Interface
|
| 116 |
|
| 117 |
```python
|
|
@@ -154,21 +203,23 @@ CosmicFish uses several modern improvements over standard transformers:
|
|
| 154 |
|
| 155 |
## Training
|
| 156 |
|
| 157 |
-
- **Dataset**: CosmicSet
|
| 158 |
- **Sequence Length**: 512 tokens
|
| 159 |
-
- **Training Steps**: ~
|
| 160 |
- **Hardware**: Nvidia A40 x1
|
| 161 |
|
| 162 |
## Performance
|
| 163 |
|
| 164 |
- **Speed**: Varies by hardware (not benchmarked)
|
| 165 |
-
- **Memory**: ~
|
| 166 |
- **File Size**: 175MB
|
|
|
|
| 167 |
|
| 168 |
## Limitations
|
| 169 |
|
| 170 |
- Small model size (90M parameters) may produce less accurate responses
|
| 171 |
- 512 token context limit
|
|
|
|
| 172 |
- Training data cutoff applies
|
| 173 |
- May generate incorrect information
|
| 174 |
- Cannot browse internet or access real-time data
|
|
|
|
| 3 |
tags:
|
| 4 |
- text-generation
|
| 5 |
- language-model
|
| 6 |
+
- causal-lm
|
| 7 |
+
- cosmicfish
|
| 8 |
+
- 90m
|
| 9 |
- transformer
|
| 10 |
+
- rope
|
| 11 |
+
- gqa
|
| 12 |
+
- swiglu
|
| 13 |
+
- rmsnorm
|
| 14 |
language: en
|
| 15 |
datasets:
|
| 16 |
+
- CosmicSet-2.0-mini
|
| 17 |
- akkiisfrommars/TreeCorpusCleanedmodel
|
| 18 |
model_type: CosmicFish
|
| 19 |
pipeline_tag: text-generation
|
|
|
|
| 32 |
wget https://huggingface.co/MistyozAI/CosmicFish-90M/resolve/main/chat.py
|
| 33 |
|
| 34 |
# Install dependencies
|
| 35 |
+
pip install transformers huggingface-hub termcolor safetensors
|
| 36 |
|
| 37 |
# Run the chat interface (automatically downloads model)
|
| 38 |
python chat.py
|
|
|
|
| 46 |
- **Architecture**: CosmicFish (RoPE, GQA, SwiGLU, RMSNorm)
|
| 47 |
- **Context Length**: 512 tokens
|
| 48 |
- **Vocabulary**: 50,257 tokens
|
| 49 |
+
- **Training Data**: CosmicSet 2.0 mini
|
| 50 |
- **Developer**: Mistyoz AI
|
| 51 |
- **Repository**: MistyozAI/CosmicFish-90M
|
| 52 |
+
- **Format**: Safetensors
|
| 53 |
|
| 54 |
## Usage
|
| 55 |
|
| 56 |
### Installation
|
| 57 |
|
| 58 |
```bash
|
| 59 |
+
pip install transformers huggingface-hub termcolor safetensors
|
| 60 |
```
|
| 61 |
|
| 62 |
### Quick Chat Interface
|
|
|
|
| 64 |
```python
|
| 65 |
from transformers import GPT2Tokenizer
|
| 66 |
from huggingface_hub import snapshot_download
|
| 67 |
+
from safetensors.torch import load_file
|
| 68 |
import torch
|
| 69 |
import json
|
| 70 |
import os
|
|
|
|
| 79 |
with open(os.path.join(cache_dir, "config.json")) as f:
|
| 80 |
config_dict = json.load(f)
|
| 81 |
|
| 82 |
+
# Load model weights from safetensors
|
| 83 |
+
state_dict = load_file(os.path.join(cache_dir, "model.safetensors"))
|
| 84 |
|
| 85 |
# Note: Full model class available in the repository
|
| 86 |
print("Model downloaded and ready for use!")
|
|
|
|
| 89 |
### Advanced Generation with Repetition Penalty
|
| 90 |
|
| 91 |
```python
|
| 92 |
+
def generate_with_repetition_penalty(model, tokenizer, prompt, max_tokens=100, temperature=0.5, penalty=1.2):
|
| 93 |
input_ids = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
|
| 94 |
generated = input_ids.clone()
|
| 95 |
|
|
|
|
| 118 |
return tokenizer.decode(generated[0], skip_special_tokens=True)
|
| 119 |
```
|
| 120 |
|
| 121 |
+
### Loading Model with Safetensors
|
| 122 |
+
|
| 123 |
+
```python
|
| 124 |
+
from safetensors.torch import load_file
|
| 125 |
+
from modeling_cosmicfish import CosmicFish, CosmicConfig
|
| 126 |
+
import json
|
| 127 |
+
|
| 128 |
+
def load_cosmicfish_model(model_path):
|
| 129 |
+
# Load config
|
| 130 |
+
with open(os.path.join(model_path, "config.json")) as f:
|
| 131 |
+
config_dict = json.load(f)
|
| 132 |
+
|
| 133 |
+
# Create model config
|
| 134 |
+
config = CosmicConfig(
|
| 135 |
+
vocab_size=config_dict["vocab_size"],
|
| 136 |
+
block_size=config_dict["block_size"],
|
| 137 |
+
n_layer=config_dict["n_layer"],
|
| 138 |
+
n_head=config_dict["n_head"],
|
| 139 |
+
n_embd=config_dict["n_embd"],
|
| 140 |
+
bias=config_dict["bias"],
|
| 141 |
+
dropout=0.0,
|
| 142 |
+
use_rotary=config_dict["use_rotary"],
|
| 143 |
+
use_swiglu=config_dict["use_swiglu"],
|
| 144 |
+
use_gqa=config_dict["use_gqa"],
|
| 145 |
+
n_query_groups=config_dict["n_query_groups"]
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
# Create model
|
| 149 |
+
model = CosmicFish(config)
|
| 150 |
+
|
| 151 |
+
# Load weights from safetensors (secure format)
|
| 152 |
+
state_dict = load_file(os.path.join(model_path, "model.safetensors"))
|
| 153 |
+
|
| 154 |
+
# Handle weight sharing (lm_head.weight shares with transformer.wte.weight)
|
| 155 |
+
if 'lm_head.weight' not in state_dict and 'transformer.wte.weight' in state_dict:
|
| 156 |
+
state_dict['lm_head.weight'] = state_dict['transformer.wte.weight']
|
| 157 |
+
|
| 158 |
+
model.load_state_dict(state_dict)
|
| 159 |
+
model.eval()
|
| 160 |
+
|
| 161 |
+
return model
|
| 162 |
+
```
|
| 163 |
+
|
| 164 |
### Chat Interface
|
| 165 |
|
| 166 |
```python
|
|
|
|
| 203 |
|
| 204 |
## Training
|
| 205 |
|
| 206 |
+
- **Dataset**: CosmicSet 2.0 mini
|
| 207 |
- **Sequence Length**: 512 tokens
|
| 208 |
+
- **Training Steps**: ~250K iterations
|
| 209 |
- **Hardware**: Nvidia A40 x1
|
| 210 |
|
| 211 |
## Performance
|
| 212 |
|
| 213 |
- **Speed**: Varies by hardware (not benchmarked)
|
| 214 |
+
- **Memory**: ~256MB RAM
|
| 215 |
- **File Size**: 175MB
|
| 216 |
+
- **Loading**: Fast and secure with safetensors
|
| 217 |
|
| 218 |
## Limitations
|
| 219 |
|
| 220 |
- Small model size (90M parameters) may produce less accurate responses
|
| 221 |
- 512 token context limit
|
| 222 |
+
- English only
|
| 223 |
- Training data cutoff applies
|
| 224 |
- May generate incorrect information
|
| 225 |
- Cannot browse internet or access real-time data
|