Direct upload

- bucket_memory_model.py  +28 -7
- model.safetensors  +2 -2
bucket_memory_model.py
CHANGED

@@ -243,22 +243,26 @@ class BucketMemoryTransformerLayer(nn.Module):
         return x
 
 
-
 # Updated model with HuggingFace compatibility
 class BucketMemoryModel(PreTrainedModel):
-    config_class = BucketMemoryConfig
+    config_class = BucketMemoryConfig
     base_model_prefix = "bucket-memory-model2"
     def __init__(self, config, adapter_kwargs=None):
         super().__init__(config)
         self.d_model = config.d_model
         self.token_embedding = nn.Embedding(config.vocab_size, config.d_model)
+
+        # TAPE-style dynamic position encoding
+        self.tape_position_encoder = nn.Sequential(
+            nn.Linear(config.d_model, config.d_model),
+            nn.ReLU(),
+            nn.Linear(config.d_model, config.d_model)
+        )
+
         self.pos_encoding = nn.Parameter(torch.zeros(1, config.max_seq_length, config.d_model))
         self._init_positional_encoding(config.max_seq_length, config.d_model)
 
-
-        num_heads = getattr(config, 'num_attention_heads', config.d_model // 64)
-        num_heads = max(1, num_heads)  # Ensure at least 1 head
-
+        num_heads = max(1, getattr(config, 'num_attention_heads', config.d_model // 64))
         self.layers = nn.ModuleList([
             BucketMemoryTransformerLayer(
                 d_model=config.d_model,
@@ -287,10 +291,11 @@ class BucketMemoryModel(PreTrainedModel):
     def forward(self, input_ids, attention_mask=None, labels=None):
         batch_size, seq_len = input_ids.size()
         x = self.token_embedding(input_ids) * np.sqrt(self.d_model)
+        tape_pos = self.tape_position_encoder(x)
+        x = x + tape_pos
         x = x + self.pos_encoding[:, :seq_len]
         x = self.dropout(x)
 
-        # Process through transformer layers
         for layer in self.layers:
             x = layer(x, attention_mask)
 
@@ -303,6 +308,22 @@
             return type('ModelOutput', (), {'loss': loss, 'logits': logits})
         return logits
 
+    def generate(self, input_ids, max_length=50):
+        generated_tokens = input_ids
+        for _ in range(max_length):
+            logits = self.forward(generated_tokens)
+            # Handle both cases: when logits is a tensor or when it's a ModelOutput object
+            if hasattr(logits, 'logits'):
+                next_token_logits = logits.logits[:, -1, :]
+            else:
+                next_token_logits = logits[:, -1, :]
+
+            next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1)
+            generated_tokens = torch.cat((generated_tokens, next_token_id), dim=1)
+
+            if next_token_id.item() == self.config.eos_token_id:
+                break
+        return generated_tokens
 AutoConfig.register("bucket-memory-model3", BucketMemoryConfig)
 AutoModel.register(BucketMemoryConfig, BucketMemoryModel)
 BucketMemoryConfig.register_for_auto_class()
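Two things change in __init__. First, the commit collapses the two-step num_heads computation into a one-liner with the same fallback (d_model // 64, floored at one head). Second, and more substantively, it adds a "TAPE-style" dynamic position encoding: the scaled token embeddings are passed through a two-layer MLP and the result is added back onto the embeddings before the fixed positional table is applied, so part of the positional signal becomes content-dependent. A minimal standalone sketch of that new path (the MLP matches the diff; d_model=512 and the tensor shapes are illustrative assumptions, not values from the commit):

import torch
import torch.nn as nn

d_model = 512  # illustrative; the real value comes from BucketMemoryConfig

# The same two-layer MLP the commit registers as self.tape_position_encoder
tape_position_encoder = nn.Sequential(
    nn.Linear(d_model, d_model),
    nn.ReLU(),
    nn.Linear(d_model, d_model),
)

x = torch.randn(2, 16, d_model)      # (batch, seq_len, d_model) scaled embeddings
tape_pos = tape_position_encoder(x)  # content-dependent positional signal
x = x + tape_pos                     # applied before the fixed pos_encoding table
print(x.shape)                       # torch.Size([2, 16, 512])

Note that because the MLP sees only embedding content, identical tokens at different positions receive identical offsets from this branch; absolute order information still comes from pos_encoding, which the forward pass adds immediately afterwards.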
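The new generate() is a plain greedy decoder: each step re-runs the full forward pass on everything generated so far, takes the argmax over the last position's logits, appends it, and stops at eos_token_id. Since next_token_id.item() only works on a single element, the loop implicitly assumes batch size 1, and re-encoding the whole prefix every step is simple but wasteful compared to cached decoding. A self-contained sketch of the same loop against a stand-in model (DummyLM, the vocabulary size, and eos_token_id=2 are illustrative, not part of the commit):

import torch
import torch.nn as nn

class DummyLM(nn.Module):
    def __init__(self, vocab_size=10, d_model=8):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, d_model)
        self.head = nn.Linear(d_model, vocab_size)

    def forward(self, input_ids):
        return self.head(self.emb(input_ids))   # (batch, seq, vocab) logits

model, eos_token_id = DummyLM(), 2
generated = torch.tensor([[1, 3, 4]])           # batch size 1, as generate() assumes
for _ in range(50):                             # max_length=50, as in the commit
    next_logits = model(generated)[:, -1, :]    # logits at the last position
    next_id = torch.argmax(next_logits, dim=-1).unsqueeze(-1)
    generated = torch.cat((generated, next_id), dim=1)
    if next_id.item() == eos_token_id:          # .item() -> batch size 1 only
        break
print(generated)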
model.safetensors
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:eac09ba151cbcef373c847cabad46af2a3cde38d98403438e949c4b88f8ae061
+size 412592100