Upload Llama-10M-1M model
Browse files- README.md +40 -1
- checkpoint-220/model.safetensors +1 -1
- checkpoint-220/optimizer.pt +1 -1
- checkpoint-220/trainer_state.json +31 -31
- checkpoint-220/training_args.bin +1 -1
- dataset_info.json +19 -0
- evaluation_plots.png +2 -2
- evaluation_results.json +23 -23
- generation_examples.json +80 -0
- metrics_summary.json +16 -0
- model.safetensors +1 -1
- model_card_metadata.yaml +22 -0
- model_info.json +31 -0
- performance_benchmarks.json +24 -0
- training_args.bin +1 -1
- training_metrics.json +4 -4
README.md
CHANGED
|
@@ -21,6 +21,12 @@ model_index:
|
|
| 21 |
- type: perplexity
|
| 22 |
value: N/A
|
| 23 |
name: Perplexity
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
---
|
| 25 |
|
| 26 |
# Llama-10M-1M
|
|
@@ -40,12 +46,25 @@ A 10M parameter LLaMA model trained on 1M synthetic tokens using the BabyLlama f
|
|
| 40 |
|
| 41 |
## Training Details
|
| 42 |
|
| 43 |
-
- **Training Loss**: 2.
|
| 44 |
- **Evaluation Loss**: N/A
|
| 45 |
- **Perplexity**: N/A
|
| 46 |
- **Learning Rate**: 3e-4
|
| 47 |
- **Batch Size**: 32
|
| 48 |
- **Epochs**: 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
## Usage
|
| 51 |
|
|
@@ -88,3 +107,23 @@ If you use this model in your research, please cite:
|
|
| 88 |
## License
|
| 89 |
|
| 90 |
This model is released under the MIT License.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
- type: perplexity
|
| 22 |
value: N/A
|
| 23 |
name: Perplexity
|
| 24 |
+
- type: loss
|
| 25 |
+
value: 2.499714469909668
|
| 26 |
+
name: Training Loss
|
| 27 |
+
- type: loss
|
| 28 |
+
value: 1.4682797193527222
|
| 29 |
+
name: Evaluation Loss
|
| 30 |
---
|
| 31 |
|
| 32 |
# Llama-10M-1M
|
|
|
|
| 46 |
|
| 47 |
## Training Details
|
| 48 |
|
| 49 |
+
- **Training Loss**: 2.499714469909668
|
| 50 |
- **Evaluation Loss**: N/A
|
| 51 |
- **Perplexity**: N/A
|
| 52 |
- **Learning Rate**: 3e-4
|
| 53 |
- **Batch Size**: 32
|
| 54 |
- **Epochs**: 2
|
| 55 |
+
- **Training Time**: 29.3597 seconds
|
| 56 |
+
- **Training Samples**: 3,519
|
| 57 |
+
|
| 58 |
+
## Evaluation Metrics
|
| 59 |
+
|
| 60 |
+
| Metric | Value |
|
| 61 |
+
|--------|-------|
|
| 62 |
+
| Perplexity | N/A |
|
| 63 |
+
| Training Loss | 2.499714469909668 |
|
| 64 |
+
| Evaluation Loss | 1.4682797193527222 |
|
| 65 |
+
| Training Time | 29.3597s |
|
| 66 |
+
| Parameters | 3,652,032 |
|
| 67 |
+
| Training Samples | 3,519 |
|
| 68 |
|
| 69 |
## Usage
|
| 70 |
|
|
|
|
| 107 |
## License
|
| 108 |
|
| 109 |
This model is released under the MIT License.
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
## Detailed Evaluation Results
|
| 113 |
+
|
| 114 |
+
### Generation Quality Metrics
|
| 115 |
+
- **Diversity Score**: 0.932 (higher is better)
|
| 116 |
+
- **Repetition Score**: 0.528 (lower is better)
|
| 117 |
+
- **Average Top Token Probability**: 0.356
|
| 118 |
+
- **Average Entropy**: 2.015
|
| 119 |
+
- **Low Confidence Ratio**: 0.791
|
| 120 |
+
|
| 121 |
+
### Sample Generations
|
| 122 |
+
1. "A child teaches slowly at the office, therefore the teacher writes happily. The bird reads thoughtfully in the garden. An artist writes carefully outside, afterwards the engineer explores eagerly. A child walks quickly in the park, meanwhile a writer creates sadly. A student"
|
| 123 |
+
2. "The cat designs carefully at the library. A child jumps eagerly in the school, furthermore an artist learns thoughtfully. The engineer explores carefully in the school. The cat discovers eagerly on the street, and the scientist teaches quickly. The bird explores slowly in the"
|
| 124 |
+
3. "The scientist teaches quickly in the park, however the engineer imagines creatively. A child thinks sadly in the lab, however a writer walks carefully. A dog writes sadly at the office. A dog explores patiently in the classroom. The engineer creates sadly in the"
|
| 125 |
+
4. "A writer thinks sadly at the library. A writer reads carefully on the street, but the cat builds quickly. A student jumps patiently in the school. A student runs happily in the school, moreover a writer reads quickly. The cat creates brilliantly in the"
|
| 126 |
+
5. "The engineer learns creatively at the office, afterwards a student runs quickly. The teacher thinks creatively in the school, and the scientist creates patiently. The scientist writes brilliantly in the lab, therefore the scientist designs brilliantly. A writer imagines creatively in the school."
|
| 127 |
+
|
| 128 |
+
### Evaluation Plots
|
| 129 |
+

|
checkpoint-220/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14614216
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:44c4152579055e34dc5ec45941f07908ba6328eb43a9851f23d71b427b97b242
|
| 3 |
size 14614216
|
checkpoint-220/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 29264715
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ae9e8f0c14808224676da1acdb2933f7088e30c97b867502269734e5fa06a9bc
|
| 3 |
size 29264715
|
checkpoint-220/trainer_state.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"best_global_step": 220,
|
| 3 |
-
"best_metric": 1.
|
| 4 |
"best_model_checkpoint": "models/Llama-10M-1M/checkpoint-220",
|
| 5 |
"epoch": 2.0,
|
| 6 |
"eval_steps": 500,
|
|
@@ -11,95 +11,95 @@
|
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
"epoch": 0.18181818181818182,
|
| 14 |
-
"grad_norm": 2.
|
| 15 |
"learning_rate": 5.6999999999999996e-05,
|
| 16 |
-
"loss": 5.
|
| 17 |
"step": 20
|
| 18 |
},
|
| 19 |
{
|
| 20 |
"epoch": 0.36363636363636365,
|
| 21 |
-
"grad_norm": 2.
|
| 22 |
"learning_rate": 0.000117,
|
| 23 |
-
"loss": 4.
|
| 24 |
"step": 40
|
| 25 |
},
|
| 26 |
{
|
| 27 |
"epoch": 0.5454545454545454,
|
| 28 |
-
"grad_norm": 1.
|
| 29 |
"learning_rate": 0.00017699999999999997,
|
| 30 |
-
"loss": 3.
|
| 31 |
"step": 60
|
| 32 |
},
|
| 33 |
{
|
| 34 |
"epoch": 0.7272727272727273,
|
| 35 |
-
"grad_norm": 1.
|
| 36 |
"learning_rate": 0.000237,
|
| 37 |
-
"loss": 2.
|
| 38 |
"step": 80
|
| 39 |
},
|
| 40 |
{
|
| 41 |
"epoch": 0.9090909090909091,
|
| 42 |
-
"grad_norm": 0.
|
| 43 |
"learning_rate": 0.00029699999999999996,
|
| 44 |
-
"loss": 2.
|
| 45 |
"step": 100
|
| 46 |
},
|
| 47 |
{
|
| 48 |
"epoch": 1.0,
|
| 49 |
-
"eval_loss": 1.
|
| 50 |
-
"eval_runtime": 0.
|
| 51 |
-
"eval_samples_per_second":
|
| 52 |
-
"eval_steps_per_second": 58.
|
| 53 |
"step": 110
|
| 54 |
},
|
| 55 |
{
|
| 56 |
"epoch": 1.0909090909090908,
|
| 57 |
-
"grad_norm": 0.
|
| 58 |
"learning_rate": 0.00028182256689929475,
|
| 59 |
-
"loss": 1.
|
| 60 |
"step": 120
|
| 61 |
},
|
| 62 |
{
|
| 63 |
"epoch": 1.2727272727272727,
|
| 64 |
-
"grad_norm": 0.
|
| 65 |
"learning_rate": 0.0002283747847073923,
|
| 66 |
-
"loss": 1.
|
| 67 |
"step": 140
|
| 68 |
},
|
| 69 |
{
|
| 70 |
"epoch": 1.4545454545454546,
|
| 71 |
-
"grad_norm": 0.
|
| 72 |
"learning_rate": 0.00015392654224618098,
|
| 73 |
-
"loss": 1.
|
| 74 |
"step": 160
|
| 75 |
},
|
| 76 |
{
|
| 77 |
"epoch": 1.6363636363636362,
|
| 78 |
-
"grad_norm": 0.
|
| 79 |
"learning_rate": 7.842618596105872e-05,
|
| 80 |
-
"loss": 1.
|
| 81 |
"step": 180
|
| 82 |
},
|
| 83 |
{
|
| 84 |
"epoch": 1.8181818181818183,
|
| 85 |
-
"grad_norm": 0.
|
| 86 |
"learning_rate": 2.210397534688617e-05,
|
| 87 |
-
"loss": 1.
|
| 88 |
"step": 200
|
| 89 |
},
|
| 90 |
{
|
| 91 |
"epoch": 2.0,
|
| 92 |
-
"grad_norm": 0.
|
| 93 |
"learning_rate": 5.1401253666411016e-08,
|
| 94 |
-
"loss": 1.
|
| 95 |
"step": 220
|
| 96 |
},
|
| 97 |
{
|
| 98 |
"epoch": 2.0,
|
| 99 |
-
"eval_loss": 1.
|
| 100 |
-
"eval_runtime": 0.
|
| 101 |
-
"eval_samples_per_second":
|
| 102 |
-
"eval_steps_per_second":
|
| 103 |
"step": 220
|
| 104 |
}
|
| 105 |
],
|
|
|
|
| 1 |
{
|
| 2 |
"best_global_step": 220,
|
| 3 |
+
"best_metric": 1.4682797193527222,
|
| 4 |
"best_model_checkpoint": "models/Llama-10M-1M/checkpoint-220",
|
| 5 |
"epoch": 2.0,
|
| 6 |
"eval_steps": 500,
|
|
|
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
"epoch": 0.18181818181818182,
|
| 14 |
+
"grad_norm": 2.169536828994751,
|
| 15 |
"learning_rate": 5.6999999999999996e-05,
|
| 16 |
+
"loss": 5.3417,
|
| 17 |
"step": 20
|
| 18 |
},
|
| 19 |
{
|
| 20 |
"epoch": 0.36363636363636365,
|
| 21 |
+
"grad_norm": 2.4265267848968506,
|
| 22 |
"learning_rate": 0.000117,
|
| 23 |
+
"loss": 4.6389,
|
| 24 |
"step": 40
|
| 25 |
},
|
| 26 |
{
|
| 27 |
"epoch": 0.5454545454545454,
|
| 28 |
+
"grad_norm": 1.8873662948608398,
|
| 29 |
"learning_rate": 0.00017699999999999997,
|
| 30 |
+
"loss": 3.6383,
|
| 31 |
"step": 60
|
| 32 |
},
|
| 33 |
{
|
| 34 |
"epoch": 0.7272727272727273,
|
| 35 |
+
"grad_norm": 1.449324607849121,
|
| 36 |
"learning_rate": 0.000237,
|
| 37 |
+
"loss": 2.7798,
|
| 38 |
"step": 80
|
| 39 |
},
|
| 40 |
{
|
| 41 |
"epoch": 0.9090909090909091,
|
| 42 |
+
"grad_norm": 0.8489532470703125,
|
| 43 |
"learning_rate": 0.00029699999999999996,
|
| 44 |
+
"loss": 2.0772,
|
| 45 |
"step": 100
|
| 46 |
},
|
| 47 |
{
|
| 48 |
"epoch": 1.0,
|
| 49 |
+
"eval_loss": 1.6459211111068726,
|
| 50 |
+
"eval_runtime": 0.5498,
|
| 51 |
+
"eval_samples_per_second": 465.608,
|
| 52 |
+
"eval_steps_per_second": 58.201,
|
| 53 |
"step": 110
|
| 54 |
},
|
| 55 |
{
|
| 56 |
"epoch": 1.0909090909090908,
|
| 57 |
+
"grad_norm": 0.400846004486084,
|
| 58 |
"learning_rate": 0.00028182256689929475,
|
| 59 |
+
"loss": 1.65,
|
| 60 |
"step": 120
|
| 61 |
},
|
| 62 |
{
|
| 63 |
"epoch": 1.2727272727272727,
|
| 64 |
+
"grad_norm": 0.38010889291763306,
|
| 65 |
"learning_rate": 0.0002283747847073923,
|
| 66 |
+
"loss": 1.518,
|
| 67 |
"step": 140
|
| 68 |
},
|
| 69 |
{
|
| 70 |
"epoch": 1.4545454545454546,
|
| 71 |
+
"grad_norm": 0.23362764716148376,
|
| 72 |
"learning_rate": 0.00015392654224618098,
|
| 73 |
+
"loss": 1.4804,
|
| 74 |
"step": 160
|
| 75 |
},
|
| 76 |
{
|
| 77 |
"epoch": 1.6363636363636362,
|
| 78 |
+
"grad_norm": 0.27331477403640747,
|
| 79 |
"learning_rate": 7.842618596105872e-05,
|
| 80 |
+
"loss": 1.4636,
|
| 81 |
"step": 180
|
| 82 |
},
|
| 83 |
{
|
| 84 |
"epoch": 1.8181818181818183,
|
| 85 |
+
"grad_norm": 0.2885988652706146,
|
| 86 |
"learning_rate": 2.210397534688617e-05,
|
| 87 |
+
"loss": 1.4567,
|
| 88 |
"step": 200
|
| 89 |
},
|
| 90 |
{
|
| 91 |
"epoch": 2.0,
|
| 92 |
+
"grad_norm": 0.27050530910491943,
|
| 93 |
"learning_rate": 5.1401253666411016e-08,
|
| 94 |
+
"loss": 1.4523,
|
| 95 |
"step": 220
|
| 96 |
},
|
| 97 |
{
|
| 98 |
"epoch": 2.0,
|
| 99 |
+
"eval_loss": 1.4682797193527222,
|
| 100 |
+
"eval_runtime": 0.6071,
|
| 101 |
+
"eval_samples_per_second": 421.665,
|
| 102 |
+
"eval_steps_per_second": 52.708,
|
| 103 |
"step": 220
|
| 104 |
}
|
| 105 |
],
|
checkpoint-220/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 5713
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7b8b524ccc79f0e11610c634958c875c565569ce0ff90ffbb93a06434dc458fe
|
| 3 |
size 5713
|
dataset_info.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "synthetic_babylm",
|
| 3 |
+
"type": "synthetic",
|
| 4 |
+
"description": "Synthetically generated text data in BabyLM style",
|
| 5 |
+
"size": {
|
| 6 |
+
"train_samples": 3519,
|
| 7 |
+
"eval_samples": 256,
|
| 8 |
+
"sequence_length": 128
|
| 9 |
+
},
|
| 10 |
+
"preprocessing": {
|
| 11 |
+
"tokenizer": "GPT2TokenizerFast",
|
| 12 |
+
"vocab_size": 288,
|
| 13 |
+
"special_tokens": [
|
| 14 |
+
"<s>",
|
| 15 |
+
"</s>",
|
| 16 |
+
"<pad>"
|
| 17 |
+
]
|
| 18 |
+
}
|
| 19 |
+
}
|
evaluation_plots.png
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
evaluation_results.json
CHANGED
|
@@ -1,28 +1,28 @@
|
|
| 1 |
{
|
| 2 |
-
"perplexity":
|
| 3 |
-
"average_loss": 3.
|
| 4 |
-
"std_loss": 0.
|
| 5 |
-
"min_loss":
|
| 6 |
-
"max_loss": 4.
|
| 7 |
"num_sequences": 100,
|
| 8 |
-
"total_tokens":
|
| 9 |
-
"avg_diversity_score": 0.
|
| 10 |
-
"avg_repetition_score": 0.
|
| 11 |
"generation_samples": [
|
| 12 |
-
"A
|
| 13 |
-
"The
|
| 14 |
-
"
|
| 15 |
-
"
|
| 16 |
-
"
|
| 17 |
-
"The scientist
|
| 18 |
-
"
|
| 19 |
-
"A
|
| 20 |
-
"
|
| 21 |
-
"
|
| 22 |
],
|
| 23 |
-
"avg_top_token_prob": 0.
|
| 24 |
-
"std_top_token_prob": 0.
|
| 25 |
-
"avg_entropy": 2.
|
| 26 |
-
"std_entropy": 0.
|
| 27 |
-
"low_confidence_ratio": 0.
|
| 28 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"perplexity": 46.30131494735422,
|
| 3 |
+
"average_loss": 3.8351703612797032,
|
| 4 |
+
"std_loss": 0.3615157261181016,
|
| 5 |
+
"min_loss": 3.2384719848632812,
|
| 6 |
+
"max_loss": 4.606306076049805,
|
| 7 |
"num_sequences": 100,
|
| 8 |
+
"total_tokens": 2907,
|
| 9 |
+
"avg_diversity_score": 0.9324846102931076,
|
| 10 |
+
"avg_repetition_score": 0.5276691331923891,
|
| 11 |
"generation_samples": [
|
| 12 |
+
"A child teaches slowly at the office, therefore the teacher writes happily. The bird reads thoughtfully in the garden. An artist writes carefully outside, afterwards the engineer explores eagerly. A child walks quickly in the park, meanwhile a writer creates sadly. A student",
|
| 13 |
+
"The cat designs carefully at the library. A child jumps eagerly in the school, furthermore an artist learns thoughtfully. The engineer explores carefully in the school. The cat discovers eagerly on the street, and the scientist teaches quickly. The bird explores slowly in the",
|
| 14 |
+
"The scientist teaches quickly in the park, however the engineer imagines creatively. A child thinks sadly in the lab, however a writer walks carefully. A dog writes sadly at the office. A dog explores patiently in the classroom. The engineer creates sadly in the",
|
| 15 |
+
"A writer thinks sadly at the library. A writer reads carefully on the street, but the cat builds quickly. A student jumps patiently in the school. A student runs happily in the school, moreover a writer reads quickly. The cat creates brilliantly in the",
|
| 16 |
+
"The engineer learns creatively at the office, afterwards a student runs quickly. The teacher thinks creatively in the school, and the scientist creates patiently. The scientist writes brilliantly in the lab, therefore the scientist designs brilliantly. A writer imagines creatively in the school.",
|
| 17 |
+
"The scientist explores slowly on the street, furthermore the cat walks eagerly. A child thinks creatively at the library. A writer imagines sadly at home, additionally the teacher writes patiently. A dog builds creatively in the garden. The cat builds patiently at home,",
|
| 18 |
+
"The engineer designs brilliantly at the library. A student thinks brilliantly in the lab. The scientist builds creatively at home, furthermore the engineer jumps slowly. The scientist teaches brilliantly at home, additionally a child jumps quickly. The teacher teaches patiently at the library.",
|
| 19 |
+
"A child runs thoughtfully in the park, and the engineer reads eagerly. A writer discovers happily on the street. The teacher writes creatively in the park, therefore a child writes brilliantly. A student explores eagerly in the school. A writer runs eagerly in the",
|
| 20 |
+
"A writer builds slowly at home. A writer thinks carefully in the lab, and a dog teaches sadly. A writer imagines creatively at the library, however the engineer jumps quickly. An artist builds patiently in the garden. The bird builds sadly in the garden",
|
| 21 |
+
"The cat learns eagerly at home, afterwards the scientist teaches brilliantly. A writer learns brilliantly at the library, furthermore the teacher writes carefully. A student jumps carefully in the park, however the engineer imagines creatively. The cat jumps slowly in the garden, therefore"
|
| 22 |
],
|
| 23 |
+
"avg_top_token_prob": 0.35648854288045845,
|
| 24 |
+
"std_top_token_prob": 0.2770798175977247,
|
| 25 |
+
"avg_entropy": 2.0150103131619814,
|
| 26 |
+
"std_entropy": 0.9688909622718542,
|
| 27 |
+
"low_confidence_ratio": 0.7913413768630234
|
| 28 |
}
|
generation_examples.json
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"examples": [
|
| 3 |
+
{
|
| 4 |
+
"id": 1,
|
| 5 |
+
"generated_text": "A student discovers sadly in the classroom. A writer thinks creatively on the street. A child builds brilliantly at the office, furthermore a dog thinks slowly. A student reads carefully in the park. The teacher reads creatively outside. The scientist learns patiently on the",
|
| 6 |
+
"method": "sampling",
|
| 7 |
+
"temperature": 0.8,
|
| 8 |
+
"top_p": 0.9
|
| 9 |
+
},
|
| 10 |
+
{
|
| 11 |
+
"id": 2,
|
| 12 |
+
"generated_text": "The bird thinks sadly in the garden. The cat jumps thoughtfully on the street, afterwards the scientist jumps slowly. An artist teaches quickly at the library. A dog writes patiently on the street, consequently the engineer learns eagerly. The cat runs slowly on the",
|
| 13 |
+
"method": "sampling",
|
| 14 |
+
"temperature": 0.8,
|
| 15 |
+
"top_p": 0.9
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"id": 3,
|
| 19 |
+
"generated_text": "A child discovers quickly outside. The teacher walks carefully in the park, however the teacher reads brilliantly. The cat creates slowly in the lab. The bird teaches thoughtfully in the lab, therefore a dog learns carefully. A child writes brilliantly on the street,",
|
| 20 |
+
"method": "sampling",
|
| 21 |
+
"temperature": 0.8,
|
| 22 |
+
"top_p": 0.9
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"id": 4,
|
| 26 |
+
"generated_text": "An artist thinks carefully on the street, consequently an artist creates patiently. The teacher designs thoughtfully at the library, and a writer jumps eagerly. The engineer jumps creatively on the street, therefore the engineer learns creatively. The engineer creates brilliantly at home. An",
|
| 27 |
+
"method": "sampling",
|
| 28 |
+
"temperature": 0.8,
|
| 29 |
+
"top_p": 0.9
|
| 30 |
+
},
|
| 31 |
+
{
|
| 32 |
+
"id": 5,
|
| 33 |
+
"generated_text": "A child imagines brilliantly in the garden. A dog reads creatively in the school. The scientist explores happily outside. The teacher discovers creatively on the street, but the scientist walks happily. The bird imagines patiently in the classroom. The cat writes creatively on the",
|
| 34 |
+
"method": "sampling",
|
| 35 |
+
"temperature": 0.8,
|
| 36 |
+
"top_p": 0.9
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"id": 6,
|
| 40 |
+
"generated_text": "The scientist designs happily at home, afterwards the cat jumps eagerly. A writer jumps happily at the library, but the bird runs creatively. The teacher reads quickly in the park. A child discovers brilliantly at home, however the cat builds happily. The teacher",
|
| 41 |
+
"method": "sampling",
|
| 42 |
+
"temperature": 0.8,
|
| 43 |
+
"top_p": 0.9
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"id": 7,
|
| 47 |
+
"generated_text": "A writer discovers carefully outside. The scientist jumps sadly in the garden, afterwards the bird runs brilliantly. A student thinks slowly in the lab, moreover the cat writes thoughtfully. The scientist discovers quickly outside. The teacher walks brilliantly in the park, additionally the",
|
| 48 |
+
"method": "sampling",
|
| 49 |
+
"temperature": 0.8,
|
| 50 |
+
"top_p": 0.9
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"id": 8,
|
| 54 |
+
"generated_text": "A student creates happily at the library, but a dog designs sadly. A writer writes thoughtfully in the park, furthermore a writer imagines happily. The cat jumps sadly in the classroom. The engineer runs sadly in the lab, additionally the cat explores quickly.",
|
| 55 |
+
"method": "sampling",
|
| 56 |
+
"temperature": 0.8,
|
| 57 |
+
"top_p": 0.9
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"id": 9,
|
| 61 |
+
"generated_text": "The cat learns brilliantly in the classroom. The teacher builds thoughtfully at the office. A dog teaches thoughtfully in the classroom. The bird teaches slowly at the office. A dog learns quickly in the classroom. A student reads happily in the garden, moreover a",
|
| 62 |
+
"method": "sampling",
|
| 63 |
+
"temperature": 0.8,
|
| 64 |
+
"top_p": 0.9
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"id": 10,
|
| 68 |
+
"generated_text": "A child reads brilliantly in the garden, and the cat creates quickly. The scientist reads carefully in the lab, but the bird runs happily. The scientist builds creatively in the garden. The cat builds eagerly in the garden, furthermore the scientist runs quickly.",
|
| 69 |
+
"method": "sampling",
|
| 70 |
+
"temperature": 0.8,
|
| 71 |
+
"top_p": 0.9
|
| 72 |
+
}
|
| 73 |
+
],
|
| 74 |
+
"generation_config": {
|
| 75 |
+
"temperature": 0.8,
|
| 76 |
+
"top_p": 0.9,
|
| 77 |
+
"max_new_tokens": 50,
|
| 78 |
+
"do_sample": true
|
| 79 |
+
}
|
| 80 |
+
}
|
metrics_summary.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"training_metrics": {
|
| 3 |
+
"loss": 2.5500883795998313,
|
| 4 |
+
"runtime_seconds": 26.0941,
|
| 5 |
+
"samples_per_second": 269.717,
|
| 6 |
+
"steps_per_second": 8.431
|
| 7 |
+
},
|
| 8 |
+
"evaluation_metrics": {
|
| 9 |
+
"perplexity": 33.73914194244188,
|
| 10 |
+
"average_loss": 3.518658645331541,
|
| 11 |
+
"diversity_score": 0.9286521711438311,
|
| 12 |
+
"repetition_score": 0.5445736434108527,
|
| 13 |
+
"confidence_score": 0.3424334205046762,
|
| 14 |
+
"entropy": 2.0452219695628933
|
| 15 |
+
}
|
| 16 |
+
}
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14614216
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:44c4152579055e34dc5ec45941f07908ba6328eb43a9851f23d71b427b97b242
|
| 3 |
size 14614216
|
model_card_metadata.yaml
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
datasets:
|
| 2 |
+
- synthetic
|
| 3 |
+
key_metrics:
|
| 4 |
+
diversity_score: 0.9286521711438311
|
| 5 |
+
perplexity: 33.73914194244188
|
| 6 |
+
training_loss: 2.5500883795998313
|
| 7 |
+
language: en
|
| 8 |
+
license: mit
|
| 9 |
+
metrics:
|
| 10 |
+
- perplexity
|
| 11 |
+
- loss
|
| 12 |
+
- diversity
|
| 13 |
+
model_name: Llama-10M-1M
|
| 14 |
+
model_size: 3652032
|
| 15 |
+
model_type: causal-lm
|
| 16 |
+
tags:
|
| 17 |
+
- text-generation
|
| 18 |
+
- pytorch
|
| 19 |
+
- causal-lm
|
| 20 |
+
- babylm
|
| 21 |
+
- small-language-model
|
| 22 |
+
training_data_size: 3519
|
model_info.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Llama-10M-1M",
|
| 3 |
+
"model_type": "causal-lm",
|
| 4 |
+
"architecture": "LLaMA",
|
| 5 |
+
"framework": "transformers",
|
| 6 |
+
"created_at": "2025-07-05T16:11:06.857492",
|
| 7 |
+
"parameters": {
|
| 8 |
+
"total": 3652032,
|
| 9 |
+
"hidden_size": 192,
|
| 10 |
+
"num_layers": 6,
|
| 11 |
+
"num_heads": 6,
|
| 12 |
+
"vocab_size": 288,
|
| 13 |
+
"sequence_length": 128
|
| 14 |
+
},
|
| 15 |
+
"training": {
|
| 16 |
+
"dataset_size": 3519,
|
| 17 |
+
"epochs": 2,
|
| 18 |
+
"batch_size": 32,
|
| 19 |
+
"learning_rate": "3e-4",
|
| 20 |
+
"training_time_seconds": 26.0941,
|
| 21 |
+
"final_loss": 2.5500883795998313
|
| 22 |
+
},
|
| 23 |
+
"evaluation": {
|
| 24 |
+
"perplexity": 33.73914194244188,
|
| 25 |
+
"diversity_score": 0.9286521711438311,
|
| 26 |
+
"repetition_score": 0.5445736434108527,
|
| 27 |
+
"top_token_confidence": 0.3424334205046762,
|
| 28 |
+
"entropy": 2.0452219695628933,
|
| 29 |
+
"num_eval_samples": 100
|
| 30 |
+
}
|
| 31 |
+
}
|
performance_benchmarks.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"text_generation": {
|
| 3 |
+
"perplexity": {
|
| 4 |
+
"value": 33.73914194244188,
|
| 5 |
+
"description": "Lower is better",
|
| 6 |
+
"benchmark_type": "intrinsic"
|
| 7 |
+
},
|
| 8 |
+
"diversity": {
|
| 9 |
+
"value": 0.9286521711438311,
|
| 10 |
+
"description": "Higher is better (0-1 scale)",
|
| 11 |
+
"benchmark_type": "quality"
|
| 12 |
+
},
|
| 13 |
+
"repetition": {
|
| 14 |
+
"value": 0.5445736434108527,
|
| 15 |
+
"description": "Lower is better (0-1 scale)",
|
| 16 |
+
"benchmark_type": "quality"
|
| 17 |
+
}
|
| 18 |
+
},
|
| 19 |
+
"efficiency": {
|
| 20 |
+
"parameters": 3652032,
|
| 21 |
+
"training_time": 26.0941,
|
| 22 |
+
"inference_speed": "Not measured"
|
| 23 |
+
}
|
| 24 |
+
}
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 5713
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7b8b524ccc79f0e11610c634958c875c565569ce0ff90ffbb93a06434dc458fe
|
| 3 |
size 5713
|
training_metrics.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
{
|
| 2 |
-
"train_runtime":
|
| 3 |
-
"train_samples_per_second":
|
| 4 |
-
"train_steps_per_second":
|
| 5 |
"total_flos": 19441019879424.0,
|
| 6 |
-
"train_loss": 2.
|
| 7 |
"epoch": 2.0,
|
| 8 |
"train_samples": 3519,
|
| 9 |
"eval_samples": 256,
|
|
|
|
| 1 |
{
|
| 2 |
+
"train_runtime": 29.3597,
|
| 3 |
+
"train_samples_per_second": 239.716,
|
| 4 |
+
"train_steps_per_second": 7.493,
|
| 5 |
"total_flos": 19441019879424.0,
|
| 6 |
+
"train_loss": 2.499714469909668,
|
| 7 |
"epoch": 2.0,
|
| 8 |
"train_samples": 3519,
|
| 9 |
"eval_samples": 256,
|