pgryko committed on
Commit
5774ee2
·
verified ·
1 Parent(s): 4debf53

Upload Llama-10M-1M model

Browse files
README.md CHANGED
@@ -21,6 +21,12 @@ model_index:
21
  - type: perplexity
22
  value: N/A
23
  name: Perplexity
 
 
 
 
 
 
24
  ---
25
 
26
  # Llama-10M-1M
@@ -40,12 +46,25 @@ A 10M parameter LLaMA model trained on 1M synthetic tokens using the BabyLlama f
40
 
41
  ## Training Details
42
 
43
- - **Training Loss**: 2.5500883795998313
44
  - **Evaluation Loss**: N/A
45
  - **Perplexity**: N/A
46
  - **Learning Rate**: 3e-4
47
  - **Batch Size**: 32
48
  - **Epochs**: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  ## Usage
51
 
@@ -88,3 +107,23 @@ If you use this model in your research, please cite:
88
  ## License
89
 
90
  This model is released under the MIT License.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  - type: perplexity
22
  value: N/A
23
  name: Perplexity
24
+ - type: loss
25
+ value: 2.499714469909668
26
+ name: Training Loss
27
+ - type: loss
28
+ value: N/A
29
+ name: Evaluation Loss
30
  ---
31
 
32
  # Llama-10M-1M
 
46
 
47
  ## Training Details
48
 
49
+ - **Training Loss**: 2.499714469909668
50
  - **Evaluation Loss**: N/A
51
  - **Perplexity**: N/A
52
  - **Learning Rate**: 3e-4
53
  - **Batch Size**: 32
54
  - **Epochs**: 2
55
+ - **Training Time**: 29.3597 seconds
56
+ - **Training Samples**: 3,519
57
+
58
+ ## Evaluation Metrics
59
+
60
+ | Metric | Value |
61
+ |--------|-------|
62
+ | Perplexity | N/A |
63
+ | Training Loss | 2.499714469909668 |
64
+ | Evaluation Loss | N/A |
65
+ | Training Time | 29.3597s |
66
+ | Parameters | 3,652,032 |
67
+ | Training Samples | 3,519 |
68
 
69
  ## Usage
70
 
 
107
  ## License
108
 
109
  This model is released under the MIT License.
110
+
111
+
112
+ ## Detailed Evaluation Results
113
+
114
+ ### Generation Quality Metrics
115
+ - **Diversity Score**: 0.932 (higher is better)
116
+ - **Repetition Score**: 0.528 (lower is better)
117
+ - **Average Top Token Probability**: 0.356
118
+ - **Average Entropy**: 2.015
119
+ - **Low Confidence Ratio**: 0.791
120
+
121
+ ### Sample Generations
122
+ 1. "A child teaches slowly at the office, therefore the teacher writes happily. The bird reads thoughtfully in the garden. An artist writes carefully outside, afterwards the engineer explores eagerly. A child walks quickly in the park, meanwhile a writer creates sadly. A student"
123
+ 2. "The cat designs carefully at the library. A child jumps eagerly in the school, furthermore an artist learns thoughtfully. The engineer explores carefully in the school. The cat discovers eagerly on the street, and the scientist teaches quickly. The bird explores slowly in the"
124
+ 3. "The scientist teaches quickly in the park, however the engineer imagines creatively. A child thinks sadly in the lab, however a writer walks carefully. A dog writes sadly at the office. A dog explores patiently in the classroom. The engineer creates sadly in the"
125
+ 4. "A writer thinks sadly at the library. A writer reads carefully on the street, but the cat builds quickly. A student jumps patiently in the school. A student runs happily in the school, moreover a writer reads quickly. The cat creates brilliantly in the"
126
+ 5. "The engineer learns creatively at the office, afterwards a student runs quickly. The teacher thinks creatively in the school, and the scientist creates patiently. The scientist writes brilliantly in the lab, therefore the scientist designs brilliantly. A writer imagines creatively in the school."
127
+
128
+ ### Evaluation Plots
129
+ ![Evaluation Plots](evaluation_plots.png)
checkpoint-220/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e25793471728cb6b312aba089827e86543414bf3d7255639385ac7e6ad26f313
3
  size 14614216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44c4152579055e34dc5ec45941f07908ba6328eb43a9851f23d71b427b97b242
3
  size 14614216
checkpoint-220/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:af90b61af45e8618189307f9f9e4e51f51f782d87d3ce371290a30e98f642236
3
  size 29264715
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae9e8f0c14808224676da1acdb2933f7088e30c97b867502269734e5fa06a9bc
3
  size 29264715
checkpoint-220/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 220,
3
- "best_metric": 1.4725391864776611,
4
  "best_model_checkpoint": "models/Llama-10M-1M/checkpoint-220",
5
  "epoch": 2.0,
6
  "eval_steps": 500,
@@ -11,95 +11,95 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.18181818181818182,
14
- "grad_norm": 2.203366756439209,
15
  "learning_rate": 5.6999999999999996e-05,
16
- "loss": 5.382,
17
  "step": 20
18
  },
19
  {
20
  "epoch": 0.36363636363636365,
21
- "grad_norm": 2.104827880859375,
22
  "learning_rate": 0.000117,
23
- "loss": 4.7236,
24
  "step": 40
25
  },
26
  {
27
  "epoch": 0.5454545454545454,
28
- "grad_norm": 1.913203477859497,
29
  "learning_rate": 0.00017699999999999997,
30
- "loss": 3.7918,
31
  "step": 60
32
  },
33
  {
34
  "epoch": 0.7272727272727273,
35
- "grad_norm": 1.5432658195495605,
36
  "learning_rate": 0.000237,
37
- "loss": 2.9107,
38
  "step": 80
39
  },
40
  {
41
  "epoch": 0.9090909090909091,
42
- "grad_norm": 0.8627734184265137,
43
  "learning_rate": 0.00029699999999999996,
44
- "loss": 2.1595,
45
  "step": 100
46
  },
47
  {
48
  "epoch": 1.0,
49
- "eval_loss": 1.674540400505066,
50
- "eval_runtime": 0.5467,
51
- "eval_samples_per_second": 468.237,
52
- "eval_steps_per_second": 58.53,
53
  "step": 110
54
  },
55
  {
56
  "epoch": 1.0909090909090908,
57
- "grad_norm": 0.42695462703704834,
58
  "learning_rate": 0.00028182256689929475,
59
- "loss": 1.6818,
60
  "step": 120
61
  },
62
  {
63
  "epoch": 1.2727272727272727,
64
- "grad_norm": 0.38224583864212036,
65
  "learning_rate": 0.0002283747847073923,
66
- "loss": 1.5296,
67
  "step": 140
68
  },
69
  {
70
  "epoch": 1.4545454545454546,
71
- "grad_norm": 0.2458978146314621,
72
  "learning_rate": 0.00015392654224618098,
73
- "loss": 1.4866,
74
  "step": 160
75
  },
76
  {
77
  "epoch": 1.6363636363636362,
78
- "grad_norm": 0.26635897159576416,
79
  "learning_rate": 7.842618596105872e-05,
80
- "loss": 1.4683,
81
  "step": 180
82
  },
83
  {
84
  "epoch": 1.8181818181818183,
85
- "grad_norm": 0.2912762761116028,
86
  "learning_rate": 2.210397534688617e-05,
87
- "loss": 1.4608,
88
  "step": 200
89
  },
90
  {
91
  "epoch": 2.0,
92
- "grad_norm": 0.27549538016319275,
93
  "learning_rate": 5.1401253666411016e-08,
94
- "loss": 1.4563,
95
  "step": 220
96
  },
97
  {
98
  "epoch": 2.0,
99
- "eval_loss": 1.4725391864776611,
100
- "eval_runtime": 0.5705,
101
- "eval_samples_per_second": 448.749,
102
- "eval_steps_per_second": 56.094,
103
  "step": 220
104
  }
105
  ],
 
1
  {
2
  "best_global_step": 220,
3
+ "best_metric": 1.4682797193527222,
4
  "best_model_checkpoint": "models/Llama-10M-1M/checkpoint-220",
5
  "epoch": 2.0,
6
  "eval_steps": 500,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.18181818181818182,
14
+ "grad_norm": 2.169536828994751,
15
  "learning_rate": 5.6999999999999996e-05,
16
+ "loss": 5.3417,
17
  "step": 20
18
  },
19
  {
20
  "epoch": 0.36363636363636365,
21
+ "grad_norm": 2.4265267848968506,
22
  "learning_rate": 0.000117,
23
+ "loss": 4.6389,
24
  "step": 40
25
  },
26
  {
27
  "epoch": 0.5454545454545454,
28
+ "grad_norm": 1.8873662948608398,
29
  "learning_rate": 0.00017699999999999997,
30
+ "loss": 3.6383,
31
  "step": 60
32
  },
33
  {
34
  "epoch": 0.7272727272727273,
35
+ "grad_norm": 1.449324607849121,
36
  "learning_rate": 0.000237,
37
+ "loss": 2.7798,
38
  "step": 80
39
  },
40
  {
41
  "epoch": 0.9090909090909091,
42
+ "grad_norm": 0.8489532470703125,
43
  "learning_rate": 0.00029699999999999996,
44
+ "loss": 2.0772,
45
  "step": 100
46
  },
47
  {
48
  "epoch": 1.0,
49
+ "eval_loss": 1.6459211111068726,
50
+ "eval_runtime": 0.5498,
51
+ "eval_samples_per_second": 465.608,
52
+ "eval_steps_per_second": 58.201,
53
  "step": 110
54
  },
55
  {
56
  "epoch": 1.0909090909090908,
57
+ "grad_norm": 0.400846004486084,
58
  "learning_rate": 0.00028182256689929475,
59
+ "loss": 1.65,
60
  "step": 120
61
  },
62
  {
63
  "epoch": 1.2727272727272727,
64
+ "grad_norm": 0.38010889291763306,
65
  "learning_rate": 0.0002283747847073923,
66
+ "loss": 1.518,
67
  "step": 140
68
  },
69
  {
70
  "epoch": 1.4545454545454546,
71
+ "grad_norm": 0.23362764716148376,
72
  "learning_rate": 0.00015392654224618098,
73
+ "loss": 1.4804,
74
  "step": 160
75
  },
76
  {
77
  "epoch": 1.6363636363636362,
78
+ "grad_norm": 0.27331477403640747,
79
  "learning_rate": 7.842618596105872e-05,
80
+ "loss": 1.4636,
81
  "step": 180
82
  },
83
  {
84
  "epoch": 1.8181818181818183,
85
+ "grad_norm": 0.2885988652706146,
86
  "learning_rate": 2.210397534688617e-05,
87
+ "loss": 1.4567,
88
  "step": 200
89
  },
90
  {
91
  "epoch": 2.0,
92
+ "grad_norm": 0.27050530910491943,
93
  "learning_rate": 5.1401253666411016e-08,
94
+ "loss": 1.4523,
95
  "step": 220
96
  },
97
  {
98
  "epoch": 2.0,
99
+ "eval_loss": 1.4682797193527222,
100
+ "eval_runtime": 0.6071,
101
+ "eval_samples_per_second": 421.665,
102
+ "eval_steps_per_second": 52.708,
103
  "step": 220
104
  }
105
  ],
checkpoint-220/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7081ab95bdd29f298ff46535770fb8f791491cc9135ee1f83aa9fcac31132803
3
  size 5713
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b8b524ccc79f0e11610c634958c875c565569ce0ff90ffbb93a06434dc458fe
3
  size 5713
dataset_info.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "synthetic_babylm",
3
+ "type": "synthetic",
4
+ "description": "Synthetically generated text data in BabyLM style",
5
+ "size": {
6
+ "train_samples": 3519,
7
+ "eval_samples": 256,
8
+ "sequence_length": 128
9
+ },
10
+ "preprocessing": {
11
+ "tokenizer": "GPT2TokenizerFast",
12
+ "vocab_size": 288,
13
+ "special_tokens": [
14
+ "<s>",
15
+ "</s>",
16
+ "<pad>"
17
+ ]
18
+ }
19
+ }
evaluation_plots.png CHANGED

Git LFS Details

  • SHA256: 4920d9658bef69e263cf776fadd1fb8317a7558184b0f6cbb8d75a17afc6659b
  • Pointer size: 131 Bytes
  • Size of remote file: 117 kB

Git LFS Details

  • SHA256: 1b764ec245c47c6e41a540ea992df8bffc9ddc70a0de67cf43e1af5b369c2e5f
  • Pointer size: 131 Bytes
  • Size of remote file: 113 kB
evaluation_results.json CHANGED
@@ -1,28 +1,28 @@
1
  {
2
- "perplexity": 33.73914194244188,
3
- "average_loss": 3.518658645331541,
4
- "std_loss": 0.35455904870960114,
5
- "min_loss": 2.8167550563812256,
6
- "max_loss": 4.2027974128723145,
7
  "num_sequences": 100,
8
- "total_tokens": 2992,
9
- "avg_diversity_score": 0.9286521711438311,
10
- "avg_repetition_score": 0.5445736434108527,
11
  "generation_samples": [
12
- "A student discovers sadly in the classroom. A writer thinks creatively on the street. A child builds brilliantly at the office, furthermore a dog thinks slowly. A student reads carefully in the park. The teacher reads creatively outside. The scientist learns patiently on the",
13
- "The bird thinks sadly in the garden. The cat jumps thoughtfully on the street, afterwards the scientist jumps slowly. An artist teaches quickly at the library. A dog writes patiently on the street, consequently the engineer learns eagerly. The cat runs slowly on the",
14
- "A child discovers quickly outside. The teacher walks carefully in the park, however the teacher reads brilliantly. The cat creates slowly in the lab. The bird teaches thoughtfully in the lab, therefore a dog learns carefully. A child writes brilliantly on the street,",
15
- "An artist thinks carefully on the street, consequently an artist creates patiently. The teacher designs thoughtfully at the library, and a writer jumps eagerly. The engineer jumps creatively on the street, therefore the engineer learns creatively. The engineer creates brilliantly at home. An",
16
- "A child imagines brilliantly in the garden. A dog reads creatively in the school. The scientist explores happily outside. The teacher discovers creatively on the street, but the scientist walks happily. The bird imagines patiently in the classroom. The cat writes creatively on the",
17
- "The scientist designs happily at home, afterwards the cat jumps eagerly. A writer jumps happily at the library, but the bird runs creatively. The teacher reads quickly in the park. A child discovers brilliantly at home, however the cat builds happily. The teacher",
18
- "A writer discovers carefully outside. The scientist jumps sadly in the garden, afterwards the bird runs brilliantly. A student thinks slowly in the lab, moreover the cat writes thoughtfully. The scientist discovers quickly outside. The teacher walks brilliantly in the park, additionally the",
19
- "A student creates happily at the library, but a dog designs sadly. A writer writes thoughtfully in the park, furthermore a writer imagines happily. The cat jumps sadly in the classroom. The engineer runs sadly in the lab, additionally the cat explores quickly.",
20
- "The cat learns brilliantly in the classroom. The teacher builds thoughtfully at the office. A dog teaches thoughtfully in the classroom. The bird teaches slowly at the office. A dog learns quickly in the classroom. A student reads happily in the garden, moreover a",
21
- "A child reads brilliantly in the garden, and the cat creates quickly. The scientist reads carefully in the lab, but the bird runs happily. The scientist builds creatively in the garden. The cat builds eagerly in the garden, furthermore the scientist runs quickly."
22
  ],
23
- "avg_top_token_prob": 0.3424334205046762,
24
- "std_top_token_prob": 0.2698688641225257,
25
- "avg_entropy": 2.0452219695628933,
26
- "std_entropy": 0.9384522715673917,
27
- "low_confidence_ratio": 0.8365885416666666
28
  }
 
1
  {
2
+ "perplexity": 46.30131494735422,
3
+ "average_loss": 3.8351703612797032,
4
+ "std_loss": 0.3615157261181016,
5
+ "min_loss": 3.2384719848632812,
6
+ "max_loss": 4.606306076049805,
7
  "num_sequences": 100,
8
+ "total_tokens": 2907,
9
+ "avg_diversity_score": 0.9324846102931076,
10
+ "avg_repetition_score": 0.5276691331923891,
11
  "generation_samples": [
12
+ "A child teaches slowly at the office, therefore the teacher writes happily. The bird reads thoughtfully in the garden. An artist writes carefully outside, afterwards the engineer explores eagerly. A child walks quickly in the park, meanwhile a writer creates sadly. A student",
13
+ "The cat designs carefully at the library. A child jumps eagerly in the school, furthermore an artist learns thoughtfully. The engineer explores carefully in the school. The cat discovers eagerly on the street, and the scientist teaches quickly. The bird explores slowly in the",
14
+ "The scientist teaches quickly in the park, however the engineer imagines creatively. A child thinks sadly in the lab, however a writer walks carefully. A dog writes sadly at the office. A dog explores patiently in the classroom. The engineer creates sadly in the",
15
+ "A writer thinks sadly at the library. A writer reads carefully on the street, but the cat builds quickly. A student jumps patiently in the school. A student runs happily in the school, moreover a writer reads quickly. The cat creates brilliantly in the",
16
+ "The engineer learns creatively at the office, afterwards a student runs quickly. The teacher thinks creatively in the school, and the scientist creates patiently. The scientist writes brilliantly in the lab, therefore the scientist designs brilliantly. A writer imagines creatively in the school.",
17
+ "The scientist explores slowly on the street, furthermore the cat walks eagerly. A child thinks creatively at the library. A writer imagines sadly at home, additionally the teacher writes patiently. A dog builds creatively in the garden. The cat builds patiently at home,",
18
+ "The engineer designs brilliantly at the library. A student thinks brilliantly in the lab. The scientist builds creatively at home, furthermore the engineer jumps slowly. The scientist teaches brilliantly at home, additionally a child jumps quickly. The teacher teaches patiently at the library.",
19
+ "A child runs thoughtfully in the park, and the engineer reads eagerly. A writer discovers happily on the street. The teacher writes creatively in the park, therefore a child writes brilliantly. A student explores eagerly in the school. A writer runs eagerly in the",
20
+ "A writer builds slowly at home. A writer thinks carefully in the lab, and a dog teaches sadly. A writer imagines creatively at the library, however the engineer jumps quickly. An artist builds patiently in the garden. The bird builds sadly in the garden",
21
+ "The cat learns eagerly at home, afterwards the scientist teaches brilliantly. A writer learns brilliantly at the library, furthermore the teacher writes carefully. A student jumps carefully in the park, however the engineer imagines creatively. The cat jumps slowly in the garden, therefore"
22
  ],
23
+ "avg_top_token_prob": 0.35648854288045845,
24
+ "std_top_token_prob": 0.2770798175977247,
25
+ "avg_entropy": 2.0150103131619814,
26
+ "std_entropy": 0.9688909622718542,
27
+ "low_confidence_ratio": 0.7913413768630234
28
  }
generation_examples.json ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "examples": [
3
+ {
4
+ "id": 1,
5
+ "generated_text": "A student discovers sadly in the classroom. A writer thinks creatively on the street. A child builds brilliantly at the office, furthermore a dog thinks slowly. A student reads carefully in the park. The teacher reads creatively outside. The scientist learns patiently on the",
6
+ "method": "sampling",
7
+ "temperature": 0.8,
8
+ "top_p": 0.9
9
+ },
10
+ {
11
+ "id": 2,
12
+ "generated_text": "The bird thinks sadly in the garden. The cat jumps thoughtfully on the street, afterwards the scientist jumps slowly. An artist teaches quickly at the library. A dog writes patiently on the street, consequently the engineer learns eagerly. The cat runs slowly on the",
13
+ "method": "sampling",
14
+ "temperature": 0.8,
15
+ "top_p": 0.9
16
+ },
17
+ {
18
+ "id": 3,
19
+ "generated_text": "A child discovers quickly outside. The teacher walks carefully in the park, however the teacher reads brilliantly. The cat creates slowly in the lab. The bird teaches thoughtfully in the lab, therefore a dog learns carefully. A child writes brilliantly on the street,",
20
+ "method": "sampling",
21
+ "temperature": 0.8,
22
+ "top_p": 0.9
23
+ },
24
+ {
25
+ "id": 4,
26
+ "generated_text": "An artist thinks carefully on the street, consequently an artist creates patiently. The teacher designs thoughtfully at the library, and a writer jumps eagerly. The engineer jumps creatively on the street, therefore the engineer learns creatively. The engineer creates brilliantly at home. An",
27
+ "method": "sampling",
28
+ "temperature": 0.8,
29
+ "top_p": 0.9
30
+ },
31
+ {
32
+ "id": 5,
33
+ "generated_text": "A child imagines brilliantly in the garden. A dog reads creatively in the school. The scientist explores happily outside. The teacher discovers creatively on the street, but the scientist walks happily. The bird imagines patiently in the classroom. The cat writes creatively on the",
34
+ "method": "sampling",
35
+ "temperature": 0.8,
36
+ "top_p": 0.9
37
+ },
38
+ {
39
+ "id": 6,
40
+ "generated_text": "The scientist designs happily at home, afterwards the cat jumps eagerly. A writer jumps happily at the library, but the bird runs creatively. The teacher reads quickly in the park. A child discovers brilliantly at home, however the cat builds happily. The teacher",
41
+ "method": "sampling",
42
+ "temperature": 0.8,
43
+ "top_p": 0.9
44
+ },
45
+ {
46
+ "id": 7,
47
+ "generated_text": "A writer discovers carefully outside. The scientist jumps sadly in the garden, afterwards the bird runs brilliantly. A student thinks slowly in the lab, moreover the cat writes thoughtfully. The scientist discovers quickly outside. The teacher walks brilliantly in the park, additionally the",
48
+ "method": "sampling",
49
+ "temperature": 0.8,
50
+ "top_p": 0.9
51
+ },
52
+ {
53
+ "id": 8,
54
+ "generated_text": "A student creates happily at the library, but a dog designs sadly. A writer writes thoughtfully in the park, furthermore a writer imagines happily. The cat jumps sadly in the classroom. The engineer runs sadly in the lab, additionally the cat explores quickly.",
55
+ "method": "sampling",
56
+ "temperature": 0.8,
57
+ "top_p": 0.9
58
+ },
59
+ {
60
+ "id": 9,
61
+ "generated_text": "The cat learns brilliantly in the classroom. The teacher builds thoughtfully at the office. A dog teaches thoughtfully in the classroom. The bird teaches slowly at the office. A dog learns quickly in the classroom. A student reads happily in the garden, moreover a",
62
+ "method": "sampling",
63
+ "temperature": 0.8,
64
+ "top_p": 0.9
65
+ },
66
+ {
67
+ "id": 10,
68
+ "generated_text": "A child reads brilliantly in the garden, and the cat creates quickly. The scientist reads carefully in the lab, but the bird runs happily. The scientist builds creatively in the garden. The cat builds eagerly in the garden, furthermore the scientist runs quickly.",
69
+ "method": "sampling",
70
+ "temperature": 0.8,
71
+ "top_p": 0.9
72
+ }
73
+ ],
74
+ "generation_config": {
75
+ "temperature": 0.8,
76
+ "top_p": 0.9,
77
+ "max_new_tokens": 50,
78
+ "do_sample": true
79
+ }
80
+ }
metrics_summary.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "training_metrics": {
3
+ "loss": 2.5500883795998313,
4
+ "runtime_seconds": 26.0941,
5
+ "samples_per_second": 269.717,
6
+ "steps_per_second": 8.431
7
+ },
8
+ "evaluation_metrics": {
9
+ "perplexity": 33.73914194244188,
10
+ "average_loss": 3.518658645331541,
11
+ "diversity_score": 0.9286521711438311,
12
+ "repetition_score": 0.5445736434108527,
13
+ "confidence_score": 0.3424334205046762,
14
+ "entropy": 2.0452219695628933
15
+ }
16
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e25793471728cb6b312aba089827e86543414bf3d7255639385ac7e6ad26f313
3
  size 14614216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44c4152579055e34dc5ec45941f07908ba6328eb43a9851f23d71b427b97b242
3
  size 14614216
model_card_metadata.yaml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets:
2
+ - synthetic
3
+ key_metrics:
4
+ diversity_score: 0.9286521711438311
5
+ perplexity: 33.73914194244188
6
+ training_loss: 2.5500883795998313
7
+ language: en
8
+ license: mit
9
+ metrics:
10
+ - perplexity
11
+ - loss
12
+ - diversity
13
+ model_name: Llama-10M-1M
14
+ model_size: 3652032
15
+ model_type: causal-lm
16
+ tags:
17
+ - text-generation
18
+ - pytorch
19
+ - causal-lm
20
+ - babylm
21
+ - small-language-model
22
+ training_data_size: 3519
model_info.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Llama-10M-1M",
3
+ "model_type": "causal-lm",
4
+ "architecture": "LLaMA",
5
+ "framework": "transformers",
6
+ "created_at": "2025-07-05T16:11:06.857492",
7
+ "parameters": {
8
+ "total": 3652032,
9
+ "hidden_size": 192,
10
+ "num_layers": 6,
11
+ "num_heads": 6,
12
+ "vocab_size": 288,
13
+ "sequence_length": 128
14
+ },
15
+ "training": {
16
+ "dataset_size": 3519,
17
+ "epochs": 2,
18
+ "batch_size": 32,
19
+ "learning_rate": "3e-4",
20
+ "training_time_seconds": 26.0941,
21
+ "final_loss": 2.5500883795998313
22
+ },
23
+ "evaluation": {
24
+ "perplexity": 33.73914194244188,
25
+ "diversity_score": 0.9286521711438311,
26
+ "repetition_score": 0.5445736434108527,
27
+ "top_token_confidence": 0.3424334205046762,
28
+ "entropy": 2.0452219695628933,
29
+ "num_eval_samples": 100
30
+ }
31
+ }
performance_benchmarks.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "text_generation": {
3
+ "perplexity": {
4
+ "value": 33.73914194244188,
5
+ "description": "Lower is better",
6
+ "benchmark_type": "intrinsic"
7
+ },
8
+ "diversity": {
9
+ "value": 0.9286521711438311,
10
+ "description": "Higher is better (0-1 scale)",
11
+ "benchmark_type": "quality"
12
+ },
13
+ "repetition": {
14
+ "value": 0.5445736434108527,
15
+ "description": "Lower is better (0-1 scale)",
16
+ "benchmark_type": "quality"
17
+ }
18
+ },
19
+ "efficiency": {
20
+ "parameters": 3652032,
21
+ "training_time": 26.0941,
22
+ "inference_speed": "Not measured"
23
+ }
24
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7081ab95bdd29f298ff46535770fb8f791491cc9135ee1f83aa9fcac31132803
3
  size 5713
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b8b524ccc79f0e11610c634958c875c565569ce0ff90ffbb93a06434dc458fe
3
  size 5713
training_metrics.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "train_runtime": 26.0941,
3
- "train_samples_per_second": 269.717,
4
- "train_steps_per_second": 8.431,
5
  "total_flos": 19441019879424.0,
6
- "train_loss": 2.5500883795998313,
7
  "epoch": 2.0,
8
  "train_samples": 3519,
9
  "eval_samples": 256,
 
1
  {
2
+ "train_runtime": 29.3597,
3
+ "train_samples_per_second": 239.716,
4
+ "train_steps_per_second": 7.493,
5
  "total_flos": 19441019879424.0,
6
+ "train_loss": 2.499714469909668,
7
  "epoch": 2.0,
8
  "train_samples": 3519,
9
  "eval_samples": 256,