# Convert a locally trained TinyLlama checkpoint (lit-gpt style output) to
# Hugging Face format and upload the model, config, tokenizer, and model card.
import json
import os

import sentencepiece as spm
import torch
import torch.nn as nn
import yaml
from huggingface_hub import HfApi, upload_file
from tokenizers import SentencePieceBPETokenizer
from transformers import GPT2Config, LlamaTokenizer, PreTrainedTokenizerFast

# Define the TinyLlama Model
class TinyLlama(nn.Module):
    """Minimal TinyLlama-style language model.

    Pipeline: token embedding -> stack of transformer encoder layers ->
    linear projection back to vocabulary logits. The config object only
    needs the attributes read below (vocab_size, n_embd, n_head,
    hidden_dim, n_layer).
    """

    def __init__(self, config):
        super().__init__()
        # Token-id -> embedding-vector lookup table.
        self.embedding = nn.Embedding(config.vocab_size, config.n_embd)
        # n_layer identical encoder layers; ModuleList so parameters register.
        layers = [
            nn.TransformerEncoderLayer(
                d_model=config.n_embd,
                nhead=config.n_head,
                dim_feedforward=config.hidden_dim,
            )
            for _ in range(config.n_layer)
        ]
        self.transformer_blocks = nn.ModuleList(layers)
        # Project final hidden states to per-token vocabulary logits.
        self.output_layer = nn.Linear(config.n_embd, config.vocab_size)

    def forward(self, x):
        """Return vocabulary logits for a batch of token ids."""
        hidden = self.embedding(x)
        for layer in self.transformer_blocks:
            hidden = layer(hidden)
        return self.output_layer(hidden)

# Load Model Configuration from YAML file
model_config_path = "/home/jax/out/custom-model/final/model_config.yaml"
with open(model_config_path, 'r') as file:
    config_data = yaml.safe_load(file)

# Create Model Configuration
# NOTE(review): GPT2Config is used here only as an attribute container for the
# custom TinyLlama class above; "hidden_dim" is not a native GPT-2 field and
# presumably survives only because PretrainedConfig keeps unknown kwargs as
# attributes — verify it appears in config.to_dict() before relying on it.
config = GPT2Config(
    vocab_size=config_data.get("vocab_size", 32000),    # Adjust to match TinyLlama's vocab size
    n_embd=config_data.get("n_embd", 2048),             # Embedding size for TinyLlama
    n_layer=config_data.get("n_layer", 24),             # Number of transformer layers
    n_head=config_data.get("n_head", 16),               # Number of attention heads
    hidden_dim=config_data.get("hidden_dim", 8192)      # Feedforward layer dimension
)

# Instantiate the TinyLlama Model
model = TinyLlama(config)

# Load Weights from the .pth file (CPU map so no GPU is required;
# weights_only=True avoids arbitrary-code pickle deserialization).
model_weights_path = "/home/jax/out/custom-model/final/lit_model.pth"
model_weights = torch.load(model_weights_path, map_location=torch.device('cpu'), weights_only=True)
# NOTE(review): strict=False silently drops missing/unexpected keys — if the
# checkpoint's key names don't match TinyLlama's module names, parts of the
# model stay randomly initialized with no error. Confirm the key overlap.
model.load_state_dict(model_weights, strict=False)  # strict=False to allow for minor mismatches

# Directory to Save the Model and Tokenizer for Hugging Face
model_dir = "./huggingface_tinyllama"
os.makedirs(model_dir, exist_ok=True)

# Save the Model Weights
model_weights_save_path = os.path.join(model_dir, "pytorch_model.bin")
torch.save(model.state_dict(), model_weights_save_path)

# Save the Configuration in JSON format
config_save_path = os.path.join(model_dir, "config.json")
with open(config_save_path, 'w') as f:
    json.dump(config.to_dict(), f)

# Convert the SentencePiece model to a Hugging Face-compatible tokenizer.
tokenizer_path = "/home/jax/out/custom-model/final/tokenizer.model"

# Sanity-check that the SentencePiece model file is loadable before converting.
sp_tokenizer = spm.SentencePieceProcessor()
sp_tokenizer.load(tokenizer_path)

# BUG FIX: the original code passed the raw SentencePiece .model file to
# SentencePieceBPETokenizer(vocab=...), which expects a vocab/merges pair, and
# then discarded that object entirely by calling
# PreTrainedTokenizerFast.from_pretrained(tokenizer_path) — from_pretrained
# expects a saved tokenizer directory or hub repo id, not a .model file, and
# fails at runtime. LlamaTokenizer wraps a raw SentencePiece .model file
# directly, which is the correct conversion path for a (Tiny)Llama checkpoint.
hf_tokenizer = LlamaTokenizer(
    vocab_file=tokenizer_path,
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
)

# Save the tokenizer. This writes tokenizer.model and tokenizer_config.json
# into model_dir — exactly the files the upload step below expects to find.
hf_tokenizer.save_pretrained(model_dir)

# Push to Hugging Face Hub: make sure the target repo exists (exist_ok makes
# this idempotent across re-runs).
# BUG FIX: `api` and `repo_id` were used without ever being defined, which
# raises NameError. repo_id matches the repo referenced in the model card.
repo_id = "jacksonstrut/tinyllama-1.1B-chat"
api = HfApi()
api.create_repo(repo_id=repo_id, token=os.getenv('HUGGINGFACE_API_TOKEN'), exist_ok=True)

# Create a Model Card for the Model (YAML front-matter + markdown body,
# published as README.md on the Hub repo).
model_card = """
---
language: en
tags:
- tinyllama
- language-model
- chat
license: apache-2.0
---

# TinyLlama 1.1B Chat Model

## Model Description
TinyLlama is a lightweight LLaMA-based model with 1.1 billion parameters, designed to perform well on conversational and text generation tasks. It has been fine-tuned specifically for chat applications, providing coherent and context-aware responses.

## Training Data
The model was trained on a diverse dataset, including web text, books, and conversational data, to make it capable of handling a wide range of language styles.

## Usage
You can use this model for conversational AI, text completion, or other natural language generation tasks. Here’s a quick example:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("jacksonstrut/tinyllama-1.1B-chat")
model = AutoModelForCausalLM.from_pretrained("jacksonstrut/tinyllama-1.1B-chat")

input_ids = tokenizer("Hello, how are you?", return_tensors="pt").input_ids
output = model.generate(input_ids)
print(tokenizer.decode(output[0]))
```

## Limitations
- The model may produce biased or inappropriate outputs as it is trained on general datasets from the internet.
- It may not be suitable for all applications, especially those requiring factual accuracy.

## License
This model is licensed under the Apache 2.0 License.
"""

# Save the Model Card to README.md inside the local export directory.
readme_path = os.path.join(model_dir, "README.md")
with open(readme_path, 'w') as f:
    f.write(model_card)

# Upload all local artifacts to the Hugging Face Hub.
# BUG FIX: the original had a comment claiming the model card was uploaded but
# never actually uploaded README.md; it also repeated the upload_file
# boilerplate once per file. A single loop uploads every artifact (weights,
# config, model card, and both tokenizer files) uniformly.
hf_token = os.getenv('HUGGINGFACE_API_TOKEN')
for artifact_name in (
    "pytorch_model.bin",
    "config.json",
    "README.md",
    "tokenizer.model",
    "tokenizer_config.json",
):
    upload_file(
        path_or_fileobj=os.path.join(model_dir, artifact_name),
        path_in_repo=artifact_name,
        repo_id=repo_id,
        token=hf_token,
    )