import torch
import torch.nn as nn
import yaml
from transformers import GPT2Config, LlamaTokenizerFast
from huggingface_hub import HfApi, upload_file
import os
import json
import sentencepiece as spm
# Define the TinyLlama model
class TinyLlama(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embedding = nn.Embedding(config.vocab_size, config.n_embd)
        self.transformer_blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=config.n_embd,
                nhead=config.n_head,
                dim_feedforward=config.hidden_dim,
                batch_first=True  # tokenized inputs arrive as (batch, seq), not (seq, batch)
            )
            for _ in range(config.n_layer)
        ])
        self.output_layer = nn.Linear(config.n_embd, config.vocab_size)

    def forward(self, x):
        x = self.embedding(x)  # (batch, seq) -> (batch, seq, n_embd)
        for block in self.transformer_blocks:
            x = block(x)
        return self.output_layer(x)  # project hidden states to vocabulary logits
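# Note: nn.TransformerEncoderLayer applies unmasked (bidirectional) attention, so this class is a
# simplified stand-in for TinyLlama's causal, Llama-style decoder rather than a faithful port;
# with strict=False below, checkpoint weights that do not match this layout are silently skipped.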
# Load Model Configuration from YAML file
model_config_path = "/home/jax/out/custom-model/final/model_config.yaml"
with open(model_config_path, 'r') as file:
    config_data = yaml.safe_load(file)
# Create Model Configuration
config = GPT2Config(
    vocab_size=config_data.get("vocab_size", 32000),  # TinyLlama's vocabulary size
    n_embd=config_data.get("n_embd", 2048),           # embedding dimension
    n_layer=config_data.get("n_layer", 24),           # number of transformer layers
    n_head=config_data.get("n_head", 16),             # number of attention heads
    hidden_dim=config_data.get("hidden_dim", 8192)    # feedforward dimension; not a native GPT2Config
                                                      # field, but PretrainedConfig stores unknown
                                                      # kwargs as attributes, which TinyLlama reads
)
# Instantiate the TinyLlama Model
model = TinyLlama(config)
# Load Weights from the .pth file
model_weights_path = "/home/jax/out/custom-model/final/lit_model.pth"
model_weights = torch.load(model_weights_path, map_location=torch.device('cpu'), weights_only=True)
model.load_state_dict(model_weights, strict=False) # strict=False to allow for minor mismatches
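# Optional sanity check: TinyLlama 1.1B should report roughly 1.1e9 parameters here;
# a large gap suggests the config above does not match the checkpoint.
num_params = sum(p.numel() for p in model.parameters())
print(f"Model instantiated with {num_params:,} parameters")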
# Directory to Save the Model and Tokenizer for Hugging Face
model_dir = "./huggingface_tinyllama"
os.makedirs(model_dir, exist_ok=True)
# Save the Model Weights
model_weights_save_path = os.path.join(model_dir, "pytorch_model.bin")
torch.save(model.state_dict(), model_weights_save_path)
# Save the Configuration in JSON format
config_save_path = os.path.join(model_dir, "config.json")
with open(config_save_path, 'w') as f:
    json.dump(config.to_dict(), f)
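# Optional sanity check: the saved config should round-trip through from_pretrained
# (GPT2Config.from_pretrained reads config.json from a local directory).
reloaded_config = GPT2Config.from_pretrained(model_dir)
assert reloaded_config.vocab_size == config.vocab_size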
# Convert the SentencePiece model to a Hugging Face-compatible tokenizer
tokenizer_path = "/home/jax/out/custom-model/final/tokenizer.model"

# Sanity-check that the raw SentencePiece model loads on its own
sp_tokenizer = spm.SentencePieceProcessor()
sp_tokenizer.load(tokenizer_path)

# LlamaTokenizerFast can be built directly from a SentencePiece .model file via vocab_file;
# the conversion to a fast tokenizer happens internally (requires the sentencepiece and
# protobuf packages). Special tokens are declared up front so they survive serialization.
hf_tokenizer = LlamaTokenizerFast(
    vocab_file=tokenizer_path,
    unk_token="<unk>",
    bos_token="<s>",
    eos_token="</s>",
    pad_token="<pad>",
)

# Save the Hugging Face tokenizer; this writes tokenizer.model and tokenizer_config.json
# into model_dir, which the upload step below expects
hf_tokenizer.save_pretrained(model_dir)
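# Optional sanity check: round-trip a sample string through the converted tokenizer
sample_text = "Hello, how are you?"
encoded = hf_tokenizer(sample_text).input_ids
print(hf_tokenizer.decode(encoded, skip_special_tokens=True))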
# Push to the Hugging Face Hub
api = HfApi()
repo_id = "jacksonstrut/tinyllama-1.1B-chat"  # must match the repo referenced in the model card below
api.create_repo(repo_id=repo_id, token=os.getenv('HUGGINGFACE_API_TOKEN'), exist_ok=True)
# Create a Model Card for the Model
model_card = """
---
language: en
tags:
- tinyllama
- language-model
- chat
license: apache-2.0
---
# TinyLlama 1.1B Chat Model
## Model Description
TinyLlama is a lightweight LLaMA-based model with 1.1 billion parameters, designed to perform well on conversational and text generation tasks. It has been fine-tuned specifically for chat applications, providing coherent and context-aware responses.
## Training Data
The model was trained on a diverse dataset, including web text, books, and conversational data, to make it capable of handling a wide range of language styles.
## Usage
You can use this model for conversational AI, text completion, or other natural language generation tasks. Here’s a quick example:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("jacksonstrut/tinyllama-1.1B-chat")
model = AutoModelForCausalLM.from_pretrained("jacksonstrut/tinyllama-1.1B-chat")
input_ids = tokenizer("Hello, how are you?", return_tensors="pt").input_ids
output = model.generate(input_ids)
print(tokenizer.decode(output[0]))
```
## Limitations
- The model may produce biased or inappropriate outputs as it is trained on general datasets from the internet.
- It may not be suitable for all applications, especially those requiring factual accuracy.
## License
This model is licensed under the Apache 2.0 License.
"""
# Save the Model Card to README.md
readme_path = os.path.join(model_dir, "README.md")
with open(readme_path, 'w') as f:
    f.write(model_card)
# Upload files to the Hugging Face Hub
upload_file(
    path_or_fileobj=model_weights_save_path,
    path_in_repo="pytorch_model.bin",
    repo_id=repo_id,
    token=os.getenv('HUGGINGFACE_API_TOKEN')
)
upload_file(
    path_or_fileobj=config_save_path,
    path_in_repo="config.json",
    repo_id=repo_id,
    token=os.getenv('HUGGINGFACE_API_TOKEN')
)
# Upload the tokenizer files (save_pretrained above wrote these into model_dir)
upload_file(
    path_or_fileobj=os.path.join(model_dir, "tokenizer.model"),
    path_in_repo="tokenizer.model",
    repo_id=repo_id,
    token=os.getenv('HUGGINGFACE_API_TOKEN')
)
upload_file(
    path_or_fileobj=os.path.join(model_dir, "tokenizer_config.json"),
    path_in_repo="tokenizer_config.json",
    repo_id=repo_id,
    token=os.getenv('HUGGINGFACE_API_TOKEN')
)
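# Upload the model card as well so the Hub repo page renders the README written above
upload_file(
    path_or_fileobj=readme_path,
    path_in_repo="README.md",
    repo_id=repo_id,
    token=os.getenv('HUGGINGFACE_API_TOKEN')
)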