Azrail committed (verified)
Commit 889071d · Parent(s): 24fb7a8

Training in progress, step 31000

Files changed (5):
  1. README.md +1 -0
  2. config.json +1 -5
  3. model.py +0 -2
  4. model.safetensors +1 -1
  5. training_args.bin +2 -2
README.md CHANGED
@@ -1,6 +1,7 @@
 ---
 library_name: transformers
 tags:
+- smallm
 - generated_from_trainer
 model-index:
 - name: smallm_140_rope
config.json CHANGED
@@ -4,10 +4,6 @@
   ],
   "attention_bias": false,
   "attention_dropout": 0.1,
-  "auto_map": {
-    "AutoConfig": "config.SmalLmConfig",
-    "AutoModelForCausalLM": "model.SmalLmForCausalLM"
-  },
   "balancing_coef": 0.0001,
   "bos_token_id": 1,
   "embedding_dropout": 0.0,
@@ -43,7 +39,7 @@
   "sliding_window_attention": true,
   "sliding_window_context": 1024,
   "sliding_window_period": 4,
-  "static_residual": true,
+  "static_residual": false,
   "token_experts": 3,
   "torch_dtype": "float32",
   "transformers_version": "4.50.3",
model.py CHANGED
@@ -635,8 +635,6 @@ class SmalLmModel(SmalLmPreTrainedModel):
         cache_position: Optional[torch.Tensor],
     ):
         if USE_FLASH and inputs_embeds.is_cuda:
-            if attention_mask is None:
-                attention_mask = torch.ones(*inputs_embeds.shape[:2], device=inputs_embeds.device)
             return attention_mask
         dtype, device = inputs_embeds.dtype, inputs_embeds.device
         past_token = (
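For context on the removed lines: flash-attention code paths generally treat attention_mask=None as "no padding", so materializing an all-ones mask is semantically a no-op that only forces downstream code to inspect a dense mask. A standalone before/after sketch of the pass-through behavior (not the repo's actual helper):

from typing import Optional

import torch


def mask_before(inputs_embeds: torch.Tensor,
                attention_mask: Optional[torch.Tensor]) -> torch.Tensor:
    # Old behavior: a None mask was replaced by a (batch, seq_len) tensor
    # of ones -- "nothing is padded", but expressed as a dense tensor.
    if attention_mask is None:
        attention_mask = torch.ones(*inputs_embeds.shape[:2],
                                    device=inputs_embeds.device)
    return attention_mask


def mask_after(inputs_embeds: torch.Tensor,
               attention_mask: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
    # New behavior: None passes through, signalling "no padding" directly.
    return attention_mask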
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fc13ffa23a1f5210f44d10669aa87f3ec7bfb7a2664786f76ce56132b042639e
+oid sha256:e5bbb33796637d85d181dd86914f0d0b2932daf04a02e2d42b0e675ffd28388a
 size 563074920
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5c535587179e528588509a5683a599c692165045d10114ebf77f1f94172c77e9
-size 5496
+oid sha256:37300a576f29a5a8ddf81ea75e13d6c1ee5bf582f11fc6860569d8fcc97499d1
+size 6008
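Both binary files above are Git LFS pointers: the repo stores only an oid (the SHA-256 of the blob) and its size, while the payload lives in LFS storage. Here model.safetensors keeps its exact size (563074920 bytes) under a new hash, while training_args.bin grows from 5496 to 6008 bytes. A hypothetical integrity check for a downloaded file, using only the pointer's oid (file name and helper are assumptions, not part of the repo):

import hashlib


def sha256_of(path: str) -> str:
    # Stream in 1 MiB chunks so half-gigabyte weights need not fit in memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()


# oid taken from the pointer file committed above
expected = "e5bbb33796637d85d181dd86914f0d0b2932daf04a02e2d42b0e675ffd28388a"
assert sha256_of("model.safetensors") == expected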