PhysiQuanty committed on
Commit
d0ecd4a
·
verified ·
1 Parent(s): 7d41cd3

inference-ready export

Browse files
README.md CHANGED
@@ -1,3 +1,8 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
1
+ # BinaryLLM (HF export)
2
+
3
+ Tokenizer-free / base-N model export.
4
+
5
+ ## Load
6
+ ```python
7
+ from transformers import AutoModelForCausalLM
8
+ m = AutoModelForCausalLM.from_pretrained("./hf_binaryllm_repo", trust_remote_code=True)
+ ```
__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .configuration_binaryllm import BinaryLLMConfig
2
+ from .modeling_binaryllm import BinaryLLMForCausalLM
config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "binaryllm",
3
+ "architectures": [
4
+ "BinaryLLMForCausalLM"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_binaryllm.BinaryLLMConfig",
8
+ "AutoModelForCausalLM": "modeling_binaryllm.BinaryLLMForCausalLM"
9
+ },
10
+ "vocab_size": 4,
11
+ "hidden_size": 384,
12
+ "num_hidden_layers": 6,
13
+ "num_attention_heads": 6,
14
+ "intermediate_size": 1536,
15
+ "max_position_embeddings": 4096,
16
+ "dropout": 0.1,
17
+ "activation": "gelu",
18
+ "torch_dtype": "float32"
19
+ }
configuration_binaryllm.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+
3
+
4
class BinaryLLMConfig(PretrainedConfig):
    """Configuration for BinaryLLM, a tiny causal transformer exported with a
    small base-N (tokenizer-free) vocabulary.

    Args:
        vocab_size: Number of distinct symbols (default 4).
        hidden_size: Transformer / embedding width.
        num_hidden_layers: Number of encoder layers.
        num_attention_heads: Attention heads per layer.
        intermediate_size: Feed-forward hidden width.
        max_position_embeddings: Maximum supported sequence length.
        dropout: Dropout probability.
        activation: Activation function name (e.g. ``"gelu"``).
        **kwargs: Forwarded to :class:`~transformers.PretrainedConfig`.
    """

    model_type = "binaryllm"

    def __init__(
        self,
        vocab_size: int = 4,
        hidden_size: int = 384,
        num_hidden_layers: int = 6,
        num_attention_heads: int = 6,
        intermediate_size: int = 1536,
        max_position_embeddings: int = 4096,
        dropout: float = 0.1,
        activation: str = "gelu",
        **kwargs,
    ):
        # Values may arrive from JSON as floats or strings; normalise every
        # integral field eagerly so downstream code can trust the types.
        integral = {
            "vocab_size": vocab_size,
            "hidden_size": hidden_size,
            "num_hidden_layers": num_hidden_layers,
            "num_attention_heads": num_attention_heads,
            "intermediate_size": intermediate_size,
            "max_position_embeddings": max_position_embeddings,
        }
        for field_name, raw in integral.items():
            setattr(self, field_name, int(raw))
        self.dropout = float(dropout)
        self.activation = str(activation)
        super().__init__(**kwargs)
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83cd04bbee5a84427aaef0411311563f07bfa0f72c892d2e1f24e4b912810816
3
+ size 42610624
modeling_binaryllm.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ from transformers import PreTrainedModel
10
+ from transformers.modeling_outputs import CausalLMOutput
11
+
12
+ from .configuration_binaryllm import BinaryLLMConfig
13
+
14
+
15
+ class PositionalEncoding(nn.Module):
16
+ """
17
+ Sinusoidal positional encoding, stocké en fp32,
18
+ puis casté au dtype de x à chaque forward.
19
+ """
20
+
21
+ def __init__(self, d_model: int, max_len: int) -> None:
22
+ super().__init__()
23
+ pe = torch.zeros(max_len, d_model, dtype=torch.float32)
24
+ position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
25
+ div_term = torch.exp(
26
+ torch.arange(0, d_model, 2, dtype=torch.float32) * (-torch.log(torch.tensor(10000.0)) / d_model)
27
+ )
28
+ pe[:, 0::2] = torch.sin(position * div_term)
29
+ pe[:, 1::2] = torch.cos(position * div_term)
30
+ pe = pe.unsqueeze(0) # (1, max_len, d_model)
31
+ self.register_buffer("pe", pe, persistent=False)
32
+
33
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
34
+ t = x.size(1)
35
+ pe = self.pe[:, :t, :]
36
+ pe = pe.to(device=x.device, dtype=x.dtype)
37
+ return x + pe
38
+
39
+
40
@dataclass
class _InnerCfg:
    """Plain hyper-parameter bundle read by ``TinyTransformerLM``."""

    block_size: int  # maximum sequence length (sizes the positional table and causal mask)
    embed_dim: int  # token-embedding / model width
    vocab_size: int  # number of distinct token ids
    num_heads: int  # attention heads per encoder layer
    num_layers: int  # number of transformer encoder layers
    ff_hidden_dim: int  # feed-forward hidden width
    dropout: float  # dropout probability
    layernorm_dim: Optional[int] = None  # final LayerNorm width; None -> embed_dim
    head_dim: Optional[int] = None  # LM-head input width; None -> layernorm width
51
+
52
+
53
class TinyTransformerLM(nn.Module):
    """Minimal causal language model built on ``nn.TransformerEncoder``.

    Token embedding + sinusoidal positions feed a stack of encoder layers
    masked causally; a LayerNorm (optionally preceded/followed by projection
    layers when the configured dims differ) feeds a bias-free LM head.
    """

    def __init__(self, cfg: _InnerCfg) -> None:
        super().__init__()
        self.cfg = cfg

        n_tokens = cfg.vocab_size
        self.tok_embed = nn.Embedding(n_tokens, cfg.embed_dim)
        self.pos_encoding = PositionalEncoding(cfg.embed_dim, cfg.block_size)

        layer = nn.TransformerEncoderLayer(
            d_model=cfg.embed_dim,
            nhead=cfg.num_heads,
            dim_feedforward=cfg.ff_hidden_dim,
            dropout=cfg.dropout,
            activation="gelu",
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(layer, num_layers=cfg.num_layers)

        norm_dim = cfg.layernorm_dim or cfg.embed_dim
        proj_dim = cfg.head_dim or norm_dim

        # Optional adapters when the checkpoint used non-default widths.
        self.pre_ln_proj: Optional[nn.Linear] = None
        if norm_dim != cfg.embed_dim:
            self.pre_ln_proj = nn.Linear(cfg.embed_dim, norm_dim)

        self.ln = nn.LayerNorm(norm_dim)

        self.head_pre: Optional[nn.Linear] = None
        if proj_dim != norm_dim:
            self.head_pre = nn.Linear(norm_dim, proj_dim)

        self.head = nn.Linear(proj_dim, n_tokens, bias=False)

        # Tie input/output embeddings only when every width lines up exactly.
        if self.pre_ln_proj is None and self.head_pre is None and proj_dim == cfg.embed_dim:
            self.head.weight = self.tok_embed.weight

        # Boolean upper-triangular mask: True above the diagonal = blocked.
        blocked = torch.triu(
            torch.ones(cfg.block_size, cfg.block_size, dtype=torch.bool), diagonal=1
        )
        self.register_buffer("causal_mask", blocked, persistent=False)

    def forward(self, tokens: torch.Tensor, padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Return logits of shape (batch, seq, vocab) for ``tokens``.

        ``padding_mask`` follows the torch key-padding convention:
        True marks positions to ignore.
        """
        seq_len = tokens.size(1)
        hidden = self.pos_encoding(self.tok_embed(tokens))

        mask = self.causal_mask[:seq_len, :seq_len].to(device=tokens.device)
        if padding_mask is not None:
            padding_mask = padding_mask[:, :seq_len].to(device=tokens.device, dtype=torch.bool)

        hidden = self.encoder(hidden, mask=mask, src_key_padding_mask=padding_mask)

        if self.pre_ln_proj is not None:
            hidden = self.pre_ln_proj(hidden)
        hidden = self.ln(hidden)
        if self.head_pre is not None:
            hidden = self.head_pre(hidden)
        return self.head(hidden)
115
+
116
+
117
class BinaryLLMForCausalLM(PreTrainedModel):
    """Hugging-Face wrapper exposing ``TinyTransformerLM`` as a causal LM.

    Translates the HF calling convention (``input_ids`` / ``attention_mask``
    / ``labels``) into the inner model's interface and computes the standard
    shifted cross-entropy loss when labels are provided.
    """

    config_class = BinaryLLMConfig
    main_input_name = "input_ids"

    def __init__(self, config: BinaryLLMConfig):
        super().__init__(config)
        # Re-pack the HF config into the inner model's plain dataclass.
        self.model = TinyTransformerLM(
            _InnerCfg(
                block_size=int(config.max_position_embeddings),
                embed_dim=int(config.hidden_size),
                vocab_size=int(config.vocab_size),
                num_heads=int(config.num_attention_heads),
                num_layers=int(config.num_hidden_layers),
                ff_hidden_dim=int(config.intermediate_size),
                dropout=float(getattr(config, "dropout", 0.0)),
                layernorm_dim=None,
                head_dim=None,
            )
        )
        self.post_init()

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> CausalLMOutput:
        """Run the inner LM; returns logits and, with labels, the LM loss."""
        # HF attention_mask uses 1 = attend; the torch key-padding mask the
        # inner model expects uses True = ignore, hence the inversion.
        key_padding = None if attention_mask is None else ~attention_mask.to(torch.bool)

        logits = self.model(input_ids, padding_mask=key_padding)

        loss = None
        if labels is not None:
            # Next-token objective: position t predicts label t+1; -100 pads.
            loss = F.cross_entropy(
                logits[:, :-1, :].contiguous().view(-1, self.config.vocab_size),
                labels[:, 1:].contiguous().view(-1),
                ignore_index=-100,
            )

        return CausalLMOutput(loss=loss, logits=logits)