Upload folder using huggingface_hub
Browse files
logs/output_run_20260201_haznadar_carapine.log
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-02-01 17:51:49,092 - root - INFO - Run: run_20260201_haznadar_carapine
|
| 2 |
+
2026-02-01 17:51:49,092 - root - INFO - Log directory: /root/tiny_moe/training_runs/Tiny_MoE/logs
|
| 3 |
+
2026-02-01 17:51:49,092 - root - INFO - Output dir: /root/tiny_moe/training_runs
|
| 4 |
+
2026-02-01 17:51:51,385 - jax._src.xla_bridge - INFO - Unable to initialize backend 'tpu': INTERNAL: Failed to open libtpu.so: libtpu.so: cannot open shared object file: No such file or directory
|
| 5 |
+
2026-02-01 17:51:56,983 - root - INFO - Flax version: 0.11.1
|
| 6 |
+
2026-02-01 17:51:56,983 - root - INFO - Optax version: 0.2.6
|
| 7 |
+
2026-02-01 17:51:56,983 - root - INFO - Platform: gpu
|
| 8 |
+
2026-02-01 17:51:56,983 - root - INFO - Num Devices: 8
|
| 9 |
+
2026-02-01 17:51:56,983 - root - INFO - Devices: [CudaDevice(id=0), CudaDevice(id=1), CudaDevice(id=2), CudaDevice(id=3), CudaDevice(id=4), CudaDevice(id=5), CudaDevice(id=6), CudaDevice(id=7)]
|
| 10 |
+
2026-02-01 17:51:57,743 - root - INFO - Model config:
|
| 11 |
+
Config(name='Tiny_MoE',
|
| 12 |
+
dtype=<class 'jax.numpy.bfloat16'>,
|
| 13 |
+
vocab_size=50304,
|
| 14 |
+
block_size=2048,
|
| 15 |
+
n_layer=30,
|
| 16 |
+
n_embed=672,
|
| 17 |
+
n_glu_hidden=2048,
|
| 18 |
+
n_head=12,
|
| 19 |
+
n_kv_head=4,
|
| 20 |
+
n_experts=8,
|
| 21 |
+
init_stddev=0.02,
|
| 22 |
+
expert_load_factor=1.25,
|
| 23 |
+
aux_loss_coeff=0.01,
|
| 24 |
+
moe_bias=True,
|
| 25 |
+
mlp_bias=False,
|
| 26 |
+
attention_bias=False,
|
| 27 |
+
load_balance_loss_coeff=0.01,
|
| 28 |
+
z_loss_coeff=0.0005,
|
| 29 |
+
expert_top_k=2,
|
| 30 |
+
ln_epsilon=1e-05,
|
| 31 |
+
rope_theta=0.0001,
|
| 32 |
+
expert_partition_spec=PartitionSpec('devices',),
|
| 33 |
+
sdpa_implementation='cudnn',
|
| 34 |
+
value_residual_init=0.5,
|
| 35 |
+
logit_softcap=30.0)
|
| 36 |
+
2026-02-01 17:53:20,889 - root - INFO - Parameter Count: 1,062,182,190
|
| 37 |
+
2026-02-01 17:53:20,890 - root - INFO - Sharded / MoE Parameter Count: 992,210,160
|
| 38 |
+
2026-02-01 17:53:20,890 - root - INFO - Replicated Parameter Count: 69,972,030
|
| 39 |
+
2026-02-01 17:53:22,108 - root - INFO - Weight decay param count: 1,062,140,928
|
| 40 |
+
2026-02-01 17:53:22,108 - root - INFO - Training config:
|
| 41 |
+
TrainerConfig(num_tokens=100000000000,
|
| 42 |
+
num_tokens_per_batch=262144,
|
| 43 |
+
mB=128,
|
| 44 |
+
T=2048,
|
| 45 |
+
max_steps=381469,
|
| 46 |
+
max_lr=0.004,
|
| 47 |
+
min_lr=0.0004,
|
| 48 |
+
max_grad_norm=1.0,
|
| 49 |
+
weight_decay=0.1,
|
| 50 |
+
adam_b1=0.9,
|
| 51 |
+
adam_b2=0.95,
|
| 52 |
+
warmup_steps=3814,
|
| 53 |
+
print_interval=100,
|
| 54 |
+
val=True,
|
| 55 |
+
val_interval=5000,
|
| 56 |
+
val_batches=50,
|
| 57 |
+
checkpoint_model=False,
|
| 58 |
+
checkpoint_optimizer=False,
|
| 59 |
+
checkpoint_interval=10000)
|
| 60 |
+
2026-02-01 17:53:22,108 - root - INFO - Effective batch size per device: 16
|
| 61 |
+
2026-02-01 17:53:25,460 - root - INFO - ModdedNanoGPTDataLoader: 1030 shards (train)
|
| 62 |
+
2026-02-01 17:53:25,547 - root - INFO - HuggingfaceDataLoader initialized:
|
| 63 |
+
------------------------
|
| 64 |
+
label: train
|
| 65 |
+
shards: 1,030
|
| 66 |
+
shard size: 100,000,000
|
| 67 |
+
batch size: 128
|
| 68 |
+
block size: 2048
|
| 69 |
+
device rank: 1
|
| 70 |
+
start shard: 0
|
| 71 |
+
start pos: 0
|
| 72 |
+
------------------------
|
| 73 |
+
2026-02-01 17:53:25,548 - root - INFO - ModdedNanoGPTDataLoader: 1 shards (val)
|
| 74 |
+
2026-02-01 17:53:25,631 - root - INFO - Starting from step: 0
|
| 75 |
+
2026-02-01 17:54:32,813 - root - INFO - 0 | lr: 0.0000 | loss: 13.1395 | logits loss: 12.7500 | load balance loss: 30.1163 | z loss: 146.0000 | avg iter time: 0.00ms | avg tok/sec: 0.00 | tokens processed: 262,144
|
| 76 |
+
2026-02-01 17:57:03,446 - root - INFO - 100 | lr: 0.0001 | loss: 7.2181 | logits loss: 6.9062 | load balance loss: 30.6685 | z loss: 23.8750 | avg iter time: 1498.84ms | avg tok/sec: 174,898.18 | tokens processed: 26,476,544
|
| 77 |
+
2026-02-01 17:58:34,580 - root - INFO - 200 | lr: 0.0002 | loss: 6.2478 | logits loss: 5.9375 | load balance loss: 30.2957 | z loss: 11.1250 | avg iter time: 903.81ms | avg tok/sec: 290,041.69 | tokens processed: 52,690,944
|
| 78 |
+
2026-02-01 18:00:05,688 - root - INFO - 300 | lr: 0.0003 | loss: 5.7342 | logits loss: 5.4375 | load balance loss: 30.4150 | z loss: 11.2500 | avg iter time: 903.62ms | avg tok/sec: 290,103.45 | tokens processed: 78,905,344
|
| 79 |
+
2026-02-01 18:01:36,952 - root - INFO - 400 | lr: 0.0004 | loss: 5.4742 | logits loss: 5.1562 | load balance loss: 30.1245 | z loss: 5.9688 | avg iter time: 905.18ms | avg tok/sec: 289,605.87 | tokens processed: 105,119,744
|
| 80 |
+
2026-02-01 18:03:08,123 - root - INFO - 500 | lr: 0.0005 | loss: 5.1571 | logits loss: 4.8438 | load balance loss: 30.2382 | z loss: 4.5938 | avg iter time: 904.24ms | avg tok/sec: 289,904.36 | tokens processed: 131,334,144
|
| 81 |
+
2026-02-01 18:04:38,980 - root - INFO - 600 | lr: 0.0006 | loss: 4.9084 | logits loss: 4.5938 | load balance loss: 30.2275 | z loss: 3.0781 | avg iter time: 903.31ms | avg tok/sec: 290,202.34 | tokens processed: 157,548,544
|
| 82 |
+
2026-02-01 18:06:09,889 - root - INFO - 700 | lr: 0.0007 | loss: 4.7807 | logits loss: 4.4688 | load balance loss: 30.1966 | z loss: 2.8281 | avg iter time: 901.71ms | avg tok/sec: 290,717.74 | tokens processed: 183,762,944
|
| 83 |
+
2026-02-01 18:06:15,869 - root - WARNING - Received KeyboardInterrupt. Exiting...
|
| 84 |
+
2026-02-01 18:06:16,124 - root - INFO - Training completed.
|
logs/run_20260201_haznadar_carapine_train.csv
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
step,lr,loss,load_balance_loss,z_loss,time,tokens_processed,tokens_per_sec
|
| 2 |
+
0,1.0487677e-06,13.139485359191895,30.116331100463867,146.0,0,262144,0
|
| 3 |
+
100,0.00010592554,7.218147277832031,30.66851806640625,23.875,1498.8377809524536,26476544,174898.17999744948
|
| 4 |
+
200,0.0002108023,6.247823238372803,30.295677185058594,11.125,903.8148856163025,52690944,290041.69346165017
|
| 5 |
+
300,0.00031567906,5.734152317047119,30.415040969848633,11.25,903.6224913597107,78905344,290103.44751992973
|
| 6 |
+
400,0.00042055585,5.474191188812256,30.124540328979492,5.96875,905.1750254631042,105119744,289605.8691697579
|
| 7 |
+
500,0.00052543264,5.1571149826049805,30.238168716430664,4.59375,904.2430472373962,131334144,289904.3579056437
|
| 8 |
+
600,0.0006303094,4.908387184143066,30.227489471435547,3.078125,903.3145570755005,157548544,290202.3419712139
|
| 9 |
+
700,0.00073518616,4.780745029449463,30.196577072143555,2.828125,901.7131185531616,183762944,290717.7400508729
|
logs/run_20260201_haznadar_carapine_train.png
ADDED
|
logs/run_20260201_haznadar_carapine_val.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
step,loss,logits_loss
|