ronnengmail committed on
Commit 3140dfa · verified · 1 Parent(s): 48488d4

Upload folder using huggingface_hub

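The commit message says the files were pushed with huggingface_hub. A minimal sketch of such an upload, assuming a placeholder repo id and local folder path (neither appears in this commit):

from huggingface_hub import upload_folder

# repo_id and folder_path are placeholders for illustration only.
upload_folder(
    repo_id="ronneng/hebrew-gpt",
    folder_path="./hebrew-gpt-export",
    commit_message="Upload folder using huggingface_hub",
)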
__init__.py ADDED
@@ -0,0 +1,2 @@
+from .configuration_hebrewgpt import HebrewGPTConfig
+from .modeling_hebrewgpt import HebrewGPTForCausalLM
config.json CHANGED
@@ -1,6 +1,11 @@
 {
-  "architectures": ["HebrewGPT"],
+  "architectures": ["HebrewGPTForCausalLM"],
   "model_type": "hebrew-gpt",
+  "auto_map": {
+    "AutoConfig": "configuration_hebrewgpt.HebrewGPTConfig",
+    "AutoModel": "modeling_hebrewgpt.HebrewGPTForCausalLM",
+    "AutoModelForCausalLM": "modeling_hebrewgpt.HebrewGPTForCausalLM"
+  },
   "vocab_size": 32000,
   "hidden_size": 2048,
   "num_hidden_layers": 20,
@@ -8,13 +13,8 @@
   "head_dim": 128,
   "intermediate_size": 5504,
   "max_position_embeddings": 2048,
-  "dropout": 0.1,
-  "activation": "silu",
-  "norm_type": "rmsnorm",
+  "dropout": 0.0,
   "rope_theta": 10000.0,
   "tie_word_embeddings": true,
-  "torch_dtype": "bfloat16",
-  "auto_map": {
-    "AutoModel": "generate.HebrewGPT"
-  }
+  "torch_dtype": "bfloat16"
 }
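With the auto_map now pointing at configuration_hebrewgpt.py and modeling_hebrewgpt.py, the checkpoint can be loaded through the Auto classes by trusting the remote code. A minimal sketch, assuming a placeholder repo id:

import torch
from transformers import AutoConfig, AutoModelForCausalLM

repo = "ronneng/hebrew-gpt"  # placeholder repo id, not part of this commit
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,  # matches the torch_dtype recorded in config.json
)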
configuration_hebrewgpt.py ADDED
@@ -0,0 +1,35 @@
+"""HebrewGPT configuration."""
+
+from transformers import PretrainedConfig
+
+
+class HebrewGPTConfig(PretrainedConfig):
+    model_type = "hebrew-gpt"
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=2048,
+        num_hidden_layers=20,
+        num_attention_heads=16,
+        head_dim=128,
+        intermediate_size=5504,
+        max_position_embeddings=2048,
+        dropout=0.0,
+        rope_theta=10000.0,
+        tie_word_embeddings=True,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = head_dim
+        self.intermediate_size = intermediate_size
+        self.max_position_embeddings = max_position_embeddings
+        self.dropout = dropout
+        self.rope_theta = rope_theta
+        super().__init__(
+            vocab_size=vocab_size,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
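A quick sketch of exercising the config class directly, e.g. to confirm its defaults mirror config.json (assumes the file is importable from the working directory):

from configuration_hebrewgpt import HebrewGPTConfig

cfg = HebrewGPTConfig()
assert cfg.hidden_size == 2048
assert cfg.num_attention_heads * cfg.head_dim == cfg.hidden_size  # 16 * 128 == 2048
print(cfg.to_json_string())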
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e509131f35ae113591ec21740632dfede6367f2e58ecc9b9334ca483e024561e
+size 4309997264
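The entry above is only a Git LFS pointer; the actual ~4.3 GB weights file lives in LFS storage. A small sketch of inspecting the downloaded file with the safetensors library (the local path is illustrative):

from safetensors import safe_open

# Assumes the real model.safetensors has been fetched (e.g. via git lfs pull).
with safe_open("model.safetensors", framework="pt") as f:
    for name in list(f.keys())[:5]:
        print(name, f.get_slice(name).get_shape())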
modeling_hebrewgpt.py ADDED
@@ -0,0 +1,226 @@
+"""HebrewGPT model implementation compatible with HuggingFace AutoModel."""
+
+import math
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutputWithPast
+
+from .configuration_hebrewgpt import HebrewGPTConfig
+
+
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+
+    def forward(self, x):
+        norm = torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + self.eps)
+        return (x.float() * norm).type_as(x) * self.weight
+
+
+def precompute_freqs_cis(dim: int, seq_len: int, theta: float = 10000.0):
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
+    t = torch.arange(seq_len, dtype=torch.float32)
+    freqs = torch.outer(t, freqs)
+    return torch.cos(freqs), torch.sin(freqs)
+
+
+def apply_rotary_emb(x: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor):
+    """Apply RoPE with interleaved pattern: x[..., ::2], x[..., 1::2]."""
+    x_r = x[..., ::2]
+    x_i = x[..., 1::2]
+
+    # Reshape freqs for broadcasting: (seq_len, head_dim//2) -> (1, seq_len, 1, head_dim//2)
+    cos = freqs_cos.unsqueeze(0).unsqueeze(2)
+    sin = freqs_sin.unsqueeze(0).unsqueeze(2)
+
+    o_r = x_r * cos - x_i * sin
+    o_i = x_r * sin + x_i * cos
+
+    # Interleave back
+    out = torch.stack((o_r, o_i), dim=-1).flatten(-2)
+    return out
+
+
+class HebrewGPTAttention(nn.Module):
+    def __init__(self, config: HebrewGPTConfig):
+        super().__init__()
+        self.n_heads = config.num_attention_heads
+        self.head_dim = config.head_dim
+        self.hidden_size = config.hidden_size
+
+        self.qkv = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=False)
+        self.proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
+
+        # RoPE buffers - computed from config, not stored
+        freqs_cos, freqs_sin = precompute_freqs_cis(
+            config.head_dim, config.max_position_embeddings, config.rope_theta
+        )
+        self.register_buffer("freqs_cos", freqs_cos, persistent=False)
+        self.register_buffer("freqs_sin", freqs_sin, persistent=False)
+
+    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None):
+        B, T, C = x.shape
+
+        qkv = self.qkv(x)
+        q, k, v = qkv.chunk(3, dim=-1)
+
+        q = q.view(B, T, self.n_heads, self.head_dim)
+        k = k.view(B, T, self.n_heads, self.head_dim)
+        v = v.view(B, T, self.n_heads, self.head_dim)
+
+        # Apply RoPE
+        q = apply_rotary_emb(q, self.freqs_cos[:T], self.freqs_sin[:T])
+        k = apply_rotary_emb(k, self.freqs_cos[:T], self.freqs_sin[:T])
+
+        # Transpose for attention: (B, n_heads, T, head_dim)
+        q = q.transpose(1, 2)
+        k = k.transpose(1, 2)
+        v = v.transpose(1, 2)
+
+        # Scaled dot-product attention with causal mask
+        y = F.scaled_dot_product_attention(
+            q, k, v, attn_mask=attention_mask, is_causal=(attention_mask is None)
+        )
+
+        y = y.transpose(1, 2).contiguous().view(B, T, C)
+        return self.proj(y)
+
+
+class HebrewGPTMLP(nn.Module):
+    def __init__(self, config: HebrewGPTConfig):
+        super().__init__()
+        self.gate = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+        self.up = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+        self.down = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+
+    def forward(self, x):
+        return self.down(F.silu(self.gate(x)) * self.up(x))
+
+
+class HebrewGPTBlock(nn.Module):
+    def __init__(self, config: HebrewGPTConfig):
+        super().__init__()
+        self.ln1 = RMSNorm(config.hidden_size)
+        self.attn = HebrewGPTAttention(config)
+        self.ln2 = RMSNorm(config.hidden_size)
+        self.mlp = HebrewGPTMLP(config)
+
+    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None):
+        x = x + self.attn(self.ln1(x), attention_mask)
+        x = x + self.mlp(self.ln2(x))
+        return x
+
+
+class HebrewGPTPreTrainedModel(PreTrainedModel):
+    config_class = HebrewGPTConfig
+    base_model_prefix = ""
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["HebrewGPTBlock"]
+    _keys_to_ignore_on_load_missing = [r"blocks\.\d+\.attn\.freqs_cos", r"blocks\.\d+\.attn\.freqs_sin"]
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+
+class HebrewGPTForCausalLM(HebrewGPTPreTrainedModel):
+    _tied_weights_keys = ["head.weight"]
+
+    def __init__(self, config: HebrewGPTConfig):
+        super().__init__(config)
+        self.tok_emb = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.blocks = nn.ModuleList([HebrewGPTBlock(config) for _ in range(config.num_hidden_layers)])
+        self.ln_f = RMSNorm(config.hidden_size)
+        self.head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Tie weights
+        self.head.weight = self.tok_emb.weight
+
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.tok_emb
+
+    def set_input_embeddings(self, value):
+        self.tok_emb = value
+        self.head.weight = self.tok_emb.weight
+
+    def get_output_embeddings(self):
+        return self.head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.head = new_embeddings
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Tuple] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ):
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if inputs_embeds is None:
+            x = self.tok_emb(input_ids)
+        else:
+            x = inputs_embeds
+
+        # Convert attention_mask to the right format for SDPA if provided
+        attn_mask = None
+        if attention_mask is not None:
+            # attention_mask: (B, T) with 1s for real tokens, 0s for padding
+            B, T = attention_mask.shape
+            # Create causal + padding mask for SDPA
+            causal = torch.tril(torch.ones(T, T, device=x.device, dtype=torch.bool))
+            pad_mask = attention_mask[:, None, None, :].bool()  # (B, 1, 1, T)
+            attn_mask = causal[None, None, :, :] & pad_mask  # (B, 1, T, T)
+
+        for block in self.blocks:
+            x = block(x, attn_mask)
+
+        x = self.ln_f(x)
+        logits = self.head(x)
+
+        loss = None
+        if labels is not None:
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            loss = F.cross_entropy(
+                shift_logits.view(-1, shift_logits.size(-1)),
+                shift_labels.view(-1),
+                ignore_index=-100,
+            )
+
+        if not return_dict:
+            output = (logits,)
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=None,
+            hidden_states=None,
+            attentions=None,
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **kwargs):
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+        }
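Note that forward always returns past_key_values=None and prepare_inputs_for_generation re-feeds the full input_ids, so generation runs without a KV cache (each step recomputes the whole prefix). A rough end-to-end sketch; the repo id and the Hebrew prompt are only placeholders:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "ronneng/hebrew-gpt"  # placeholder repo id
tok = AutoTokenizer.from_pretrained(repo)  # needs the sentencepiece package
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True, torch_dtype=torch.bfloat16)

inputs = tok("שלום עולם", return_tensors="pt")  # illustrative prompt
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=32, do_sample=True, top_p=0.9)
print(tok.decode(out[0], skip_special_tokens=True))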
tokenizer_config.json CHANGED
@@ -1,11 +1,9 @@
 {
-  "model_type": "sentencepiece",
-  "sentencepiece_model_file": "tokenizer.model",
-  "vocab_size": 32000,
+  "tokenizer_class": "T5Tokenizer",
+  "model_max_length": 2048,
   "bos_token": "<s>",
   "eos_token": "</s>",
   "unk_token": "<unk>",
   "pad_token": "<pad>",
-  "model_max_length": 2048,
   "clean_up_tokenization_spaces": false
 }
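Pointing tokenizer_class at T5Tokenizer lets AutoTokenizer load the SentencePiece model (tokenizer.model) without any custom tokenizer code. A minimal sketch, again with a placeholder repo id:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("ronneng/hebrew-gpt")  # placeholder repo id
print(tok.bos_token, tok.eos_token, tok.unk_token, tok.pad_token)  # <s> </s> <unk> <pad>
print(tok("שלום עולם").input_ids)  # illustrative Hebrew input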