narySt committed on
Commit b1445cc · verified · 1 Parent(s): ee015c9

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the rest.
Files changed (50)
  1. .gitattributes +1 -0
  2. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/.hydra/config.yaml +49 -0
  3. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/.hydra/hydra.yaml +160 -0
  4. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/.hydra/overrides.yaml +1 -0
  5. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/checkpoints/checkpoint_latest.pt +3 -0
  6. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/checkpoints/checkpoint_step_18000.pt +3 -0
  7. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/checkpoints/checkpoint_step_19774.pt +3 -0
  8. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/checkpoints/checkpoint_step_24000.pt +3 -0
  9. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/checkpoints/checkpoint_step_29661.pt +3 -0
  10. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/checkpoints/checkpoint_step_6000.pt +3 -0
  11. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/.ipynb_checkpoints/summary-checkpoint.txt +16 -0
  12. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/eval_config.yaml +31 -0
  13. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_checkpoint_latest.txt +17 -0
  14. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_checkpoint_step_12000.txt +17 -0
  15. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_checkpoint_step_18000.txt +17 -0
  16. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_checkpoint_step_19774.txt +17 -0
  17. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_checkpoint_step_24000.txt +17 -0
  18. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_checkpoint_step_29661.txt +17 -0
  19. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_checkpoint_step_6000.txt +17 -0
  20. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_checkpoint_step_9887.txt +17 -0
  21. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_initial_checkpoint.txt +17 -0
  22. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_model_best.txt +17 -0
  23. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_model_final.txt +17 -0
  24. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_checkpoint_latest.txt +0 -0
  25. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_checkpoint_step_12000.txt +0 -0
  26. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_checkpoint_step_18000.txt +0 -0
  27. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_checkpoint_step_19774.txt +0 -0
  28. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_checkpoint_step_24000.txt +0 -0
  29. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_checkpoint_step_29661.txt +0 -0
  30. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_checkpoint_step_6000.txt +0 -0
  31. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_checkpoint_step_9887.txt +0 -0
  32. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_initial_checkpoint.txt +0 -0
  33. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_model_best.txt +0 -0
  34. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_model_final.txt +0 -0
  35. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/summary.txt +16 -0
  36. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/model_best.pt +3 -0
  37. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/model_final.pt +3 -0
  38. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/train.log +0 -0
  39. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/debug-internal.log +15 -0
  40. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/debug.log +24 -0
  41. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/files/code/code_completion_exp/train_pythia/train.py +606 -0
  42. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/files/config.yaml +154 -0
  43. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/files/output.log +0 -0
  44. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/files/requirements.txt +245 -0
  45. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/files/wandb-metadata.json +1 -0
  46. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/files/wandb-summary.json +1 -0
  47. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/logs/debug-core.log +16 -0
  48. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/logs/debug-internal.log +15 -0
  49. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/logs/debug.log +24 -0
  50. pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/run-jhqe4qjw.wandb +3 -0
.gitattributes CHANGED
@@ -46,3 +46,4 @@ lr_sweep/hnet_xl_code_lr_1e-4/wandb/run-20260425_200722-d5usyud5/run-d5usyud5.wa
  lr_sweep/pythia_1b_lr_1e-4/wandb/run-20260425_201333-p8ozhgpm/run-p8ozhgpm.wandb filter=lfs diff=lfs merge=lfs -text
  lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/run-ln6tfunh.wandb filter=lfs diff=lfs merge=lfs -text
  lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/run-khn25dwv.wandb filter=lfs diff=lfs merge=lfs -text
+ pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/run-jhqe4qjw.wandb filter=lfs diff=lfs merge=lfs -text
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/.hydra/config.yaml ADDED
@@ -0,0 +1,49 @@
+ model:
+   name: EleutherAI/pythia-1.4b
+   checkpoint_path: null
+   from_scratch: false
+ training:
+   epochs: 3
+   batch_size: 4
+   eval_batch_size: 12
+   gradient_accumulation_steps: 4
+   lr: 2.0e-05
+   weight_decay: 0.1
+   betas:
+   - 0.9
+   - 0.95
+   eps: 1.0e-08
+   lr_scheduler: wsd
+   warmup_ratio: 0.1
+   decay_ratio: 0.2
+   warmup_steps: 100
+   min_lr_ratio: 0.1
+   max_grad_norm: 1.0
+   use_amp: true
+   resume: false
+   resume_checkpoint: null
+ data:
+   path: ${oc.env:PROJECT_ROOT}/code_completion_exp/datasets/data_V4_full
+   max_context_len: 4096
+   max_target_len: 256
+   num_workers: 4
+   pin_memory: true
+   max_train_samples: null
+   max_val_samples: null
+ logging:
+   log_interval: 10
+   save_interval: 6000
+   eval_interval: 1000
+   save_every_epoch: true
+ tracking:
+   enabled: true
+   backend: wandb
+   project: code-completion_pythia-1.4b-rerun
+   run_name: pythia_1_4b_lr_2e-5
+   entity: null
+   base_url: https://wandb.platun0v.ru
+   local_dir: ${paths.output_dir}
+ paths:
+   output_dir: outputs/pythia_1_4b_rerun/pythia_1_4b_lr_2e-5
+ seed: 42
+ device: cuda
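
The run uses a `wsd` (warmup-stable-decay) schedule built by `training_lib.schedulers.get_lr_scheduler`, which is not included in this diff. As a rough guide to how the parameters above interact, here is a minimal sketch of a typical WSD shape under that assumption; the function name `wsd_lr` is hypothetical, and the real implementation may differ (e.g. in how `warmup_ratio` vs. `warmup_steps` is resolved):

def wsd_lr(step: int, total_steps: int, base_lr: float = 2.0e-5,
           warmup_steps: int = 100, decay_ratio: float = 0.2,
           min_lr_ratio: float = 0.1) -> float:
    """Warmup-stable-decay: linear warmup, flat plateau, linear decay to min LR."""
    decay_start = int(total_steps * (1 - decay_ratio))  # last 20% of steps decay
    if step < warmup_steps:
        return base_lr * step / warmup_steps            # linear warmup
    if step < decay_start:
        return base_lr                                  # stable phase
    frac = (step - decay_start) / max(1, total_steps - decay_start)
    return base_lr * (1.0 - frac * (1.0 - min_lr_ratio))  # decay to 10% of base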
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/.hydra/hydra.yaml ADDED
@@ -0,0 +1,160 @@
+ hydra:
+   run:
+     dir: ${paths.output_dir}
+   sweep:
+     dir: outputs/multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+     subdir: ${hydra.job.num}
+   launcher:
+     _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+   sweeper:
+     _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+     max_batch_size: null
+     params: null
+   help:
+     app_name: ${hydra.job.name}
+     header: '${hydra.help.app_name} is powered by Hydra.
+
+       '
+     footer: 'Powered by Hydra (https://hydra.cc)
+
+       Use --hydra-help to view Hydra specific help
+
+       '
+     template: '${hydra.help.header}
+
+       == Configuration groups ==
+
+       Compose your configuration from those groups (group=option)
+
+
+       $APP_CONFIG_GROUPS
+
+
+       == Config ==
+
+       Override anything in the config (foo.bar=value)
+
+
+       $CONFIG
+
+
+       ${hydra.help.footer}
+
+       '
+   hydra_help:
+     template: 'Hydra (${hydra.runtime.version})
+
+       See https://hydra.cc for more info.
+
+
+       == Flags ==
+
+       $FLAGS_HELP
+
+
+       == Configuration groups ==
+
+       Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+       to command line)
+
+
+       $HYDRA_CONFIG_GROUPS
+
+
+       Use ''--cfg hydra'' to Show the Hydra config.
+
+       '
+     hydra_help: ???
+   hydra_logging:
+     version: 1
+     formatters:
+       simple:
+         format: '[%(asctime)s][HYDRA] %(message)s'
+     handlers:
+       console:
+         class: logging.StreamHandler
+         formatter: simple
+         stream: ext://sys.stdout
+     root:
+       level: INFO
+       handlers:
+       - console
+     loggers:
+       logging_example:
+         level: DEBUG
+     disable_existing_loggers: false
+   job_logging:
+     version: 1
+     formatters:
+       simple:
+         format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+     handlers:
+       console:
+         class: logging.StreamHandler
+         formatter: simple
+         stream: ext://sys.stdout
+       file:
+         class: logging.FileHandler
+         formatter: simple
+         filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+     root:
+       level: INFO
+       handlers:
+       - console
+       - file
+     disable_existing_loggers: false
+   env: {}
+   mode: RUN
+   searchpath: []
+   callbacks: {}
+   output_subdir: .hydra
+   overrides:
+     hydra:
+     - hydra.mode=RUN
+     task: []
+   job:
+     name: train
+     chdir: false
+     override_dirname: ''
+     id: ???
+     num: ???
+     config_name: config
+     env_set: {}
+     env_copy: []
+     config:
+       override_dirname:
+         kv_sep: '='
+         item_sep: ','
+         exclude_keys: []
+   runtime:
+     version: 1.3.2
+     version_base: '1.3'
+     cwd: /workspace/byte-llms-code/code_completion_exp/train_pythia
+     config_sources:
+     - path: hydra.conf
+       schema: pkg
+       provider: hydra
+     - path: /workspace/byte-llms-code/code_completion_exp/train_pythia/configs
+       schema: file
+       provider: main
+     - path: ''
+       schema: structured
+       provider: schema
+     output_dir: /workspace/byte-llms-code/code_completion_exp/train_pythia/outputs/pythia_1_4b_rerun/pythia_1_4b_lr_2e-5
+     choices:
+       paths: default
+       tracking: wandb
+       logging: default
+       data: default
+       training: default
+       model: pythia_1_4b
+       hydra/env: default
+       hydra/callbacks: null
+       hydra/job_logging: default
+       hydra/hydra_logging: default
+       hydra/hydra_help: default
+       hydra/help: default
+       hydra/sweeper: basic
+       hydra/launcher: basic
+       hydra/output: default
+   verbose: false
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/.hydra/overrides.yaml ADDED
@@ -0,0 +1 @@
+ []
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/checkpoints/checkpoint_latest.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:619099feb68a23252ee1d9d123dc5e80777d59a517d2a1277e16b11080ff840d
+ size 8488268654
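
These checkpoint entries are Git LFS pointer files: the repo stores only a version line, a sha256 object id, and the byte size, while the ~8.5 GB payload lives in LFS storage. A minimal sketch of reading one (the helper name `parse_lfs_pointer` is hypothetical; the path is just this repo's layout, the three fields are the standard LFS pointer format):

def parse_lfs_pointer(path: str) -> dict:
    """Read a Git LFS pointer file into {'version', 'oid', 'size'}."""
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    fields["size"] = int(fields["size"])  # size of the real object in bytes
    return fields

# parse_lfs_pointer("checkpoints/checkpoint_latest.pt")
# -> {'version': 'https://git-lfs.github.com/spec/v1',
#     'oid': 'sha256:619099fe...', 'size': 8488268654}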
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/checkpoints/checkpoint_step_18000.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:94e3195e5d668f03e57524372992bcc7ad140118ed7201e0616464b3ee3a851e
+ size 8488273342
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/checkpoints/checkpoint_step_19774.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e5cda16b18df8729407d83bd272355d7d621d43adccbbf1b5c643452c224677f
+ size 8488273342
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/checkpoints/checkpoint_step_24000.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:832b215b90a6d53e33a0548db8f8eabc715240f2f1f78fdf7e5885e8b469462a
+ size 8488273342
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/checkpoints/checkpoint_step_29661.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2619e2e59f770c5a99cfb4b12e9b25be6042452912ebc49a6f8e366641bac9bc
+ size 8488273342
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/checkpoints/checkpoint_step_6000.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:49c762d0ff93a76e555f3cb446b85b195af76e053559fbe8fc21b7e2e5b7d297
+ size 8488272170
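
(The checkpoint step numbers line up with the training config: 6000, 18000, and 24000 follow from `logging.save_interval: 6000`, while 9887, 19774, and 29661 look like epoch boundaries saved by `save_every_epoch: true`, consistent with 9887 optimizer steps per epoch over 3 epochs.)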
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/.ipynb_checkpoints/summary-checkpoint.txt ADDED
@@ -0,0 +1,16 @@
+ EVALUATION SUMMARY
+ ==================================================================================================
+
+ Checkpoint              Exact Match   Token Acc    BLEU   PERPLEXITY   ms/sample   samp/s
+ --------------------------------------------------------------------------------------------------
+ initial_checkpoint            0.00%      28.41%    1.10       537.69        33.0    30.34
+ checkpoint_latest            34.91%      36.09%   19.27       318.72        13.5    74.14
+ checkpoint_step_12000        33.88%      35.22%   18.91       291.07        13.3    75.07
+ checkpoint_step_18000        34.98%      36.06%   19.55       297.06        13.5    74.32
+ checkpoint_step_19774        35.04%      36.08%   19.47       297.00        13.3    74.97
+ checkpoint_step_24000        34.82%      36.00%   19.17       313.80        13.6    73.32
+ checkpoint_step_29661        34.81%      36.03%   19.32       318.72        13.5    74.20
+ checkpoint_step_6000         32.96%      34.80%   18.37       289.31        13.5    74.03
+ checkpoint_step_9887         33.98%      35.17%   18.42       268.97        13.6    73.46
+ model_best                   35.00%      36.02%   19.48       294.19        13.3    74.91
+ model_final                  34.89%      36.07%   19.29       318.72        13.5    74.09
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/eval_config.yaml ADDED
@@ -0,0 +1,31 @@
+ data:
+   path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
+   max_context_len: 4096
+   max_target_len: 256
+   num_workers: 4
+   pin_memory: true
+   max_train_samples: null
+   max_val_samples: null
+ model:
+   name: EleutherAI/pythia-1.4b
+   checkpoint_path: null
+   from_scratch: false
+ paths:
+   checkpoints_dir: outputs/pythia_1_4b_rerun/pythia_1_4b_lr_2e-5
+   initial_checkpoint: auto
+   output_dir: outputs/pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results
+ evaluation:
+   batch_size: 16
+   max_samples: null
+   compute_perplexity: true
+   bleu_tokenize: none
+   save_predictions: true
+   use_amp: true
+ generation:
+   max_new_tokens: 64
+   temperature: 0.1
+   top_k: 0
+   top_p: 1.0
+   do_sample: true
+ seed: 42
+ device: cuda
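
The eval script itself is not in this diff, but the `generation` settings above map directly onto Hugging Face's `generate()` API; a minimal sketch of an equivalent call (the prompt string is purely illustrative):

from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("EleutherAI/pythia-1.4b")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-1.4b")

inputs = tok("def fibonacci(n):", return_tensors="pt")
out = model.generate(
    **inputs,
    max_new_tokens=64,   # generation.max_new_tokens
    do_sample=True,      # generation.do_sample
    temperature=0.1,     # near-greedy sampling
    top_k=0,             # 0 disables top-k filtering
    top_p=1.0,           # no nucleus truncation
)
print(tok.decode(out[0], skip_special_tokens=True))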
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_checkpoint_latest.txt ADDED
@@ -0,0 +1,17 @@
+ Checkpoint: checkpoint_latest.pt
+ ================================================================================
+
+ exact_match: 0.3491082112940908
+ token_accuracy: 0.3609102339784317
+ bleu: 19.269757348478024
+ perplexity: 318.723480592771
+ num_samples: 35098
+ gen_wall_time_s: 473.41876831650734
+ gen_samples_per_s: 74.13732270228668
+ gen_time_per_sample_ms: 13.48848277156839
+ gen_chars_per_s: 1919.0747405954103
+ gen_batch_mean_ms: 417.0968242689166
+ gen_batch_p50_ms: 357.9665273427963
+ gen_batch_p95_ms: 846.3990613818174
+ gen_batch_max_ms: 3587.848901748657
+ gen_num_batches: 1097
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_checkpoint_step_12000.txt ADDED
@@ -0,0 +1,17 @@
+ Checkpoint: checkpoint_step_12000.pt
+ ================================================================================
+
+ exact_match: 0.33879423328964614
+ token_accuracy: 0.3521884334688284
+ bleu: 18.905961886825054
+ perplexity: 291.0708305846913
+ num_samples: 35098
+ gen_wall_time_s: 467.52853602170944
+ gen_samples_per_s: 75.07135350208921
+ gen_time_per_sample_ms: 13.320660323144038
+ gen_chars_per_s: 1932.4296388147054
+ gen_batch_mean_ms: 416.61676505721476
+ gen_batch_p50_ms: 356.2297746539116
+ gen_batch_p95_ms: 898.9157944917679
+ gen_batch_max_ms: 3586.8209674954414
+ gen_num_batches: 1097
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_checkpoint_step_18000.txt ADDED
@@ -0,0 +1,17 @@
+ Checkpoint: checkpoint_step_18000.pt
+ ================================================================================
+
+ exact_match: 0.34982050259274033
+ token_accuracy: 0.3606308170284673
+ bleu: 19.550134976587582
+ perplexity: 297.0567970765463
+ num_samples: 35098
+ gen_wall_time_s: 472.2754218801856
+ gen_samples_per_s: 74.31680408070065
+ gen_time_per_sample_ms: 13.455906942851035
+ gen_chars_per_s: 1911.1178735635738
+ gen_batch_mean_ms: 411.898644021292
+ gen_batch_p50_ms: 349.4953215122223
+ gen_batch_p95_ms: 808.1516057252898
+ gen_batch_max_ms: 3586.3386392593384
+ gen_num_batches: 1097
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_checkpoint_step_19774.txt ADDED
@@ -0,0 +1,17 @@
+ Checkpoint: checkpoint_step_19774.pt
+ ================================================================================
+
+ exact_match: 0.3504188272836059
+ token_accuracy: 0.3608470325254635
+ bleu: 19.467009802659206
+ perplexity: 296.99746886537366
+ num_samples: 35098
+ gen_wall_time_s: 468.18471866846085
+ gen_samples_per_s: 74.96613751046883
+ gen_time_per_sample_ms: 13.33935605072827
+ gen_chars_per_s: 1926.4063179274315
+ gen_batch_mean_ms: 415.29658690108533
+ gen_batch_p50_ms: 353.5846024751663
+ gen_batch_p95_ms: 864.5775735378279
+ gen_batch_max_ms: 3583.6338102817535
+ gen_num_batches: 1097
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_checkpoint_step_24000.txt ADDED
@@ -0,0 +1,17 @@
+ Checkpoint: checkpoint_step_24000.pt
+ ================================================================================
+
+ exact_match: 0.34822497008376546
+ token_accuracy: 0.35996553857617103
+ bleu: 19.169750619802162
+ perplexity: 313.79867716386906
+ num_samples: 35098
+ gen_wall_time_s: 478.67056403309107
+ gen_samples_per_s: 73.32391552193636
+ gen_time_per_sample_ms: 13.63811510721668
+ gen_chars_per_s: 1899.4065403552715
+ gen_batch_mean_ms: 415.3155261547107
+ gen_batch_p50_ms: 358.4626540541649
+ gen_batch_p95_ms: 833.3663418889046
+ gen_batch_max_ms: 3585.671544075012
+ gen_num_batches: 1097
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_checkpoint_step_29661.txt ADDED
@@ -0,0 +1,17 @@
+ Checkpoint: checkpoint_step_29661.pt
+ ================================================================================
+
+ exact_match: 0.3480540201720896
+ token_accuracy: 0.3602582610951814
+ bleu: 19.317169450326166
+ perplexity: 318.723480592771
+ num_samples: 35098
+ gen_wall_time_s: 472.98777021467686
+ gen_samples_per_s: 74.20487845609608
+ gen_time_per_sample_ms: 13.476202923661658
+ gen_chars_per_s: 1923.102577445405
+ gen_batch_mean_ms: 412.44772313639805
+ gen_batch_p50_ms: 353.74465584754944
+ gen_batch_p95_ms: 786.1013039946557
+ gen_batch_max_ms: 3586.092635989189
+ gen_num_batches: 1097
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_checkpoint_step_6000.txt ADDED
@@ -0,0 +1,17 @@
+ Checkpoint: checkpoint_step_6000.pt
+ ================================================================================
+
+ exact_match: 0.3296484130149866
+ token_accuracy: 0.34800383200388524
+ bleu: 18.371122658286087
+ perplexity: 289.313993391532
+ num_samples: 35098
+ gen_wall_time_s: 474.1001447662711
+ gen_samples_per_s: 74.03077258561719
+ gen_time_per_sample_ms: 13.507896312219247
+ gen_chars_per_s: 1912.9207405053733
+ gen_batch_mean_ms: 418.14548702895263
+ gen_batch_p50_ms: 347.5266695022583
+ gen_batch_p95_ms: 911.2774878740311
+ gen_batch_max_ms: 2318.4452056884766
+ gen_num_batches: 1097
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_checkpoint_step_9887.txt ADDED
@@ -0,0 +1,17 @@
+ Checkpoint: checkpoint_step_9887.pt
+ ================================================================================
+
+ exact_match: 0.3398199327597014
+ token_accuracy: 0.3516528843147299
+ bleu: 18.415097468391597
+ perplexity: 268.97285058053575
+ num_samples: 35098
+ gen_wall_time_s: 477.7747625410557
+ gen_samples_per_s: 73.46139384451892
+ gen_time_per_sample_ms: 13.61259224289292
+ gen_chars_per_s: 1895.455915635939
+ gen_batch_mean_ms: 433.3023355837399
+ gen_batch_p50_ms: 356.7342534661293
+ gen_batch_p95_ms: 910.7078820466995
+ gen_batch_max_ms: 3595.522381365299
+ gen_num_batches: 1097
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_initial_checkpoint.txt ADDED
@@ -0,0 +1,17 @@
+ Checkpoint: initial_checkpoint
+ ================================================================================
+
+ exact_match: 0.0
+ token_accuracy: 0.2841204686221418
+ bleu: 1.1002232570475041
+ perplexity: 537.6947490846742
+ num_samples: 35098
+ gen_wall_time_s: 1157.0034177526832
+ gen_samples_per_s: 30.33526043351967
+ gen_time_per_sample_ms: 32.96493867891854
+ gen_chars_per_s: 6686.615511496883
+ gen_batch_mean_ms: 1047.8846134453224
+ gen_batch_p50_ms: 919.267863035202
+ gen_batch_p95_ms: 1635.0471004843712
+ gen_batch_max_ms: 3586.4143297076225
+ gen_num_batches: 1097
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_model_best.txt ADDED
@@ -0,0 +1,17 @@
+ Checkpoint: model_best.pt
+ ================================================================================
+
+ exact_match: 0.34996296085247025
+ token_accuracy: 0.36019173324995174
+ bleu: 19.47680016321815
+ perplexity: 294.1937527039324
+ num_samples: 35098
+ gen_wall_time_s: 468.51309882104397
+ gen_samples_per_s: 74.91359385323449
+ gen_time_per_sample_ms: 13.348712143741636
+ gen_chars_per_s: 1920.1085354149618
+ gen_batch_mean_ms: 414.4478002894218
+ gen_batch_p50_ms: 353.60363125801086
+ gen_batch_p95_ms: 818.3927312493347
+ gen_batch_max_ms: 3586.1896350979805
+ gen_num_batches: 1097
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/metrics_model_final.txt ADDED
@@ -0,0 +1,17 @@
+ Checkpoint: model_final.pt
+ ================================================================================
+
+ exact_match: 0.348880278078523
+ token_accuracy: 0.36068736569691245
+ bleu: 19.290218865092186
+ perplexity: 318.723480592771
+ num_samples: 35098
+ gen_wall_time_s: 473.6957869678736
+ gen_samples_per_s: 74.09396698387857
+ gen_time_per_sample_ms: 13.496375490565661
+ gen_chars_per_s: 1917.0193718898054
+ gen_batch_mean_ms: 417.4575515994726
+ gen_batch_p50_ms: 359.1250032186508
+ gen_batch_p95_ms: 807.6998949050917
+ gen_batch_max_ms: 3586.7618024349213
+ gen_num_batches: 1097
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_checkpoint_latest.txt ADDED
The diff for this file is too large to render. See raw diff
 
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_checkpoint_step_12000.txt ADDED
The diff for this file is too large to render. See raw diff
 
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_checkpoint_step_18000.txt ADDED
The diff for this file is too large to render. See raw diff
 
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_checkpoint_step_19774.txt ADDED
The diff for this file is too large to render. See raw diff
 
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_checkpoint_step_24000.txt ADDED
The diff for this file is too large to render. See raw diff
 
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_checkpoint_step_29661.txt ADDED
The diff for this file is too large to render. See raw diff
 
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_checkpoint_step_6000.txt ADDED
The diff for this file is too large to render. See raw diff
 
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_checkpoint_step_9887.txt ADDED
The diff for this file is too large to render. See raw diff
 
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_initial_checkpoint.txt ADDED
The diff for this file is too large to render. See raw diff
 
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_model_best.txt ADDED
The diff for this file is too large to render. See raw diff
 
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/predictions_model_final.txt ADDED
The diff for this file is too large to render. See raw diff
 
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/eval_results/summary.txt ADDED
@@ -0,0 +1,16 @@
+ EVALUATION SUMMARY
+ ==================================================================================================
+
+ Checkpoint              Exact Match   Token Acc    BLEU   PERPLEXITY   ms/sample   samp/s
+ --------------------------------------------------------------------------------------------------
+ initial_checkpoint            0.00%      28.41%    1.10       537.69        33.0    30.34
+ checkpoint_latest            34.91%      36.09%   19.27       318.72        13.5    74.14
+ checkpoint_step_12000        33.88%      35.22%   18.91       291.07        13.3    75.07
+ checkpoint_step_18000        34.98%      36.06%   19.55       297.06        13.5    74.32
+ checkpoint_step_19774        35.04%      36.08%   19.47       297.00        13.3    74.97
+ checkpoint_step_24000        34.82%      36.00%   19.17       313.80        13.6    73.32
+ checkpoint_step_29661        34.81%      36.03%   19.32       318.72        13.5    74.20
+ checkpoint_step_6000         32.96%      34.80%   18.37       289.31        13.5    74.03
+ checkpoint_step_9887         33.98%      35.17%   18.42       268.97        13.6    73.46
+ model_best                   35.00%      36.02%   19.48       294.19        13.3    74.91
+ model_final                  34.89%      36.07%   19.29       318.72        13.5    74.09
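
For quick comparisons it can help to pull this fixed-width table back into Python; a small sketch assuming the file layout stays as above (title, rule, blank line, header, rule, then data rows):

rows = []
with open("eval_results/summary.txt") as f:
    for line in f.readlines()[5:]:          # skip title, rule, blank, header, rule
        parts = line.split()
        if parts:
            # (checkpoint, exact match %, BLEU)
            rows.append((parts[0], float(parts[1].rstrip("%")), float(parts[3])))

best = max(rows, key=lambda r: r[1])
print(best)  # ('checkpoint_step_19774', 35.04, 19.47)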
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/model_best.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:30322219874e8a6f3736e61b133825d39f1e585a64560fe6e27084f2ff995e06
+ size 2829410658
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/model_final.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a446b0ff6ad1f48b5c1989291f67e33fdb25f00ed0575315930f468fc6f4f881
+ size 2829410954
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/train.log ADDED
The diff for this file is too large to render. See raw diff
 
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/debug-internal.log ADDED
@@ -0,0 +1,15 @@
+ {"time":"2026-04-25T21:56:04.515355828Z","level":"INFO","msg":"stream: starting","core version":"0.24.0"}
+ {"time":"2026-04-25T21:56:04.914734923Z","level":"INFO","msg":"stream: created new stream","id":"jhqe4qjw"}
+ {"time":"2026-04-25T21:56:04.91479529Z","level":"INFO","msg":"handler: started","stream_id":"jhqe4qjw"}
+ {"time":"2026-04-25T21:56:04.914888584Z","level":"INFO","msg":"stream: started","id":"jhqe4qjw"}
+ {"time":"2026-04-25T21:56:04.914893063Z","level":"INFO","msg":"writer: started","stream_id":"jhqe4qjw"}
+ {"time":"2026-04-25T21:56:04.914903421Z","level":"INFO","msg":"sender: started","stream_id":"jhqe4qjw"}
+ {"time":"2026-04-25T21:56:05.060789967Z","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
+ {"time":"2026-04-25T23:20:15.280635585Z","level":"ERROR","msg":"api: HTTP error","status":403,"method":"POST","url":"https://wandb.platun0v.ru/files/nikita/code-completion_pythia-1.4b-rerun/jhqe4qjw/file_stream"}
+ {"time":"2026-04-25T23:20:15.280696408Z","level":"ERROR+4","msg":"filestream: fatal error: filestream: failed to upload: 403 Forbidden url=https://wandb.platun0v.ru/files/nikita/code-completion_pythia-1.4b-rerun/jhqe4qjw/file_stream: "}
+ {"time":"2026-04-26T01:44:27.121098166Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+ {"time":"2026-04-26T01:44:27.121923843Z","level":"INFO","msg":"handler: operation stats","stats":{}}
+ {"time":"2026-04-26T01:44:27.124737334Z","level":"INFO","msg":"stream: closing","id":"jhqe4qjw"}
+ {"time":"2026-04-26T01:44:27.124749132Z","level":"INFO","msg":"handler: closed","stream_id":"jhqe4qjw"}
+ {"time":"2026-04-26T01:44:27.124842763Z","level":"INFO","msg":"sender: closed","stream_id":"jhqe4qjw"}
+ {"time":"2026-04-26T01:44:27.124848143Z","level":"INFO","msg":"stream: closed","id":"jhqe4qjw"}
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/debug.log ADDED
@@ -0,0 +1,24 @@
+ 2026-04-25 21:56:04,224 INFO MainThread:185429 [wandb_setup.py:_flush():81] Current SDK version is 0.24.0
+ 2026-04-25 21:56:04,224 INFO MainThread:185429 [wandb_setup.py:_flush():81] Configure stats pid to 185429
+ 2026-04-25 21:56:04,224 INFO MainThread:185429 [wandb_setup.py:_flush():81] Loading settings from environment variables
+ 2026-04-25 21:56:04,225 INFO MainThread:185429 [wandb_init.py:setup_run_log_directory():717] Logging user logs to outputs/pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/logs/debug.log
+ 2026-04-25 21:56:04,225 INFO MainThread:185429 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to outputs/pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/logs/debug-internal.log
+ 2026-04-25 21:56:04,225 INFO MainThread:185429 [wandb_init.py:init():844] calling init triggers
+ 2026-04-25 21:56:04,225 INFO MainThread:185429 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
+ config: {'model': {'name': 'EleutherAI/pythia-1.4b', 'checkpoint_path': None, 'from_scratch': False}, 'training': {'epochs': 3, 'batch_size': 4, 'eval_batch_size': 12, 'gradient_accumulation_steps': 4, 'lr': 2e-05, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'eps': 1e-08, 'lr_scheduler': 'wsd', 'warmup_ratio': 0.1, 'decay_ratio': 0.2, 'warmup_steps': 100, 'min_lr_ratio': 0.1, 'max_grad_norm': 1.0, 'use_amp': True, 'resume': False, 'resume_checkpoint': None}, 'data': {'path': '/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full', 'max_context_len': 4096, 'max_target_len': 256, 'num_workers': 4, 'pin_memory': True, 'max_train_samples': None, 'max_val_samples': None}, 'logging': {'log_interval': 10, 'save_interval': 6000, 'eval_interval': 1000, 'save_every_epoch': True}, 'tracking': {'enabled': True, 'backend': 'wandb', 'project': 'code-completion_pythia-1.4b-rerun', 'run_name': 'pythia_1_4b_lr_2e-5', 'entity': None, 'base_url': 'https://wandb.platun0v.ru', 'local_dir': 'outputs/pythia_1_4b_rerun/pythia_1_4b_lr_2e-5'}, 'paths': {'output_dir': 'outputs/pythia_1_4b_rerun/pythia_1_4b_lr_2e-5'}, 'seed': 42, 'device': 'cuda', '_wandb': {'code_path': 'code/code_completion_exp/train_pythia/train.py'}}
+ 2026-04-25 21:56:04,225 INFO MainThread:185429 [wandb_init.py:init():892] starting backend
+ 2026-04-25 21:56:04,496 INFO MainThread:185429 [wandb_init.py:init():895] sending inform_init request
+ 2026-04-25 21:56:04,514 INFO MainThread:185429 [wandb_init.py:init():903] backend started and connected
+ 2026-04-25 21:56:04,517 INFO MainThread:185429 [wandb_init.py:init():973] updated telemetry
+ 2026-04-25 21:56:04,531 INFO MainThread:185429 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
+ 2026-04-25 21:56:05,060 INFO MainThread:185429 [wandb_init.py:init():1044] starting run threads in backend
+ 2026-04-25 21:56:05,217 INFO MainThread:185429 [wandb_run.py:_console_start():2529] atexit reg
+ 2026-04-25 21:56:05,217 INFO MainThread:185429 [wandb_run.py:_redirect():2377] redirect: wrap_raw
+ 2026-04-25 21:56:05,217 INFO MainThread:185429 [wandb_run.py:_redirect():2446] Wrapping output streams.
+ 2026-04-25 21:56:05,217 INFO MainThread:185429 [wandb_run.py:_redirect():2469] Redirects installed.
+ 2026-04-25 21:56:05,220 INFO MainThread:185429 [wandb_init.py:init():1084] run started, returning control to user process
+ 2026-04-26 01:44:26,160 INFO MainThread:185429 [wandb_run.py:_finish():2295] finishing run nikita/code-completion_pythia-1.4b-rerun/jhqe4qjw
+ 2026-04-26 01:44:26,160 INFO MainThread:185429 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0
+ 2026-04-26 01:44:26,161 INFO MainThread:185429 [wandb_run.py:_restore():2476] restore
+ 2026-04-26 01:44:26,161 INFO MainThread:185429 [wandb_run.py:_restore():2482] restore done
+ 2026-04-26 01:44:27,124 INFO MainThread:185429 [wandb_run.py:_footer_sync_info():3870] logging synced files
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/files/code/code_completion_exp/train_pythia/train.py ADDED
@@ -0,0 +1,606 @@
+ """
+ Training pipeline for Pythia (decoder-only transformer) on the code completion task.
+
+ Configuration via Hydra + OmegaConf, logging via Trackio.
+ DDP support via Accelerate for multi-GPU training.
+
+ Usage:
+     # Basic run (single GPU)
+     python train.py
+
+     # Multi-GPU with Accelerate
+     accelerate launch train.py
+
+     # Multi-GPU with an explicit GPU count
+     accelerate launch --num_processes=4 train.py
+
+     # Override parameters via the CLI
+     python train.py training.lr=1e-4 training.epochs=5
+
+     # Select a different model config
+     python train.py model=pythia_160m
+
+     # Multirun (sweep)
+     python train.py --multirun training.lr=1e-4,3e-4,1e-3
+
+     # Disable logging
+     python train.py tracking.enabled=false
+ """
+
+ import os
+ import math
+ import time
+ from pathlib import Path
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.utils.data import DataLoader
+ from datasets import load_from_disk
+
+ import hydra
+ from hydra.core.hydra_config import HydraConfig
+ from omegaconf import DictConfig, OmegaConf
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForCausalLM,
+     AutoConfig,
+     PreTrainedTokenizerBase,
+ )
+ from accelerate import Accelerator
+ from accelerate.utils import set_seed as accelerate_set_seed
+
+ # Ensure repo root is on sys.path (needed when running from a subdirectory)
+ import sys
+ sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+
+ # Shared training library
+ from training_lib.utils import AverageMeter, log_message
+ from training_lib.checkpointing import save_checkpoint, load_checkpoint
+ from training_lib.schedulers import get_lr_scheduler
+ from training_lib.tracking import init_tracking, log_metrics, finish_tracking
+ from training_lib.validation import run_validation
+
+
+ # ============================================================================
+ # DATA
+ # ============================================================================
+
+
+ class CodeCompletionCollator:
+     """Collate function for batching code completion examples."""
+
+     def __init__(
+         self,
+         tokenizer: PreTrainedTokenizerBase,
+         max_context_len: int = 1024,
+         max_target_len: int = 256,
+     ):
+         self.tokenizer = tokenizer
+         self.max_context_len = max_context_len
+         self.max_target_len = max_target_len
+         self.pad_token_id = tokenizer.pad_token_id
+
+     def __call__(self, batch: list[dict]) -> dict:
+         contexts = [item["context"] for item in batch]
+         targets = [item["target"] for item in batch]
+
+         encoded_contexts = self.tokenizer(
+             contexts,
+             add_special_tokens=True,
+             truncation=True,
+             max_length=self.max_context_len,
+             return_tensors=None,
+         )
+         encoded_targets = self.tokenizer(
+             targets,
+             add_special_tokens=False,
+             truncation=True,
+             max_length=self.max_target_len,
+             return_tensors=None,
+         )
+
+         input_ids_list = []
+         context_lengths = []
+
+         for ctx_ids, tgt_ids in zip(
+             encoded_contexts["input_ids"], encoded_targets["input_ids"]
+         ):
+             tgt_ids = tgt_ids + [self.tokenizer.eos_token_id]
+             context_lengths.append(len(ctx_ids))
+             input_ids_list.append(ctx_ids + tgt_ids)
+
+         max_len = max(len(ids) for ids in input_ids_list)
+
+         padded_input_ids = []
+         attention_mask = []
+
+         for ids in input_ids_list:
+             padding_len = max_len - len(ids)
+             padded_input_ids.append(ids + [self.pad_token_id] * padding_len)
+             attention_mask.append([1] * len(ids) + [0] * padding_len)
+
+         return {
+             "input_ids": torch.tensor(padded_input_ids, dtype=torch.long),
+             "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
+             "context_lengths": torch.tensor(context_lengths, dtype=torch.long),
+         }
+
+
+ def create_dataloaders(
+     cfg: DictConfig, tokenizer: PreTrainedTokenizerBase
+ ) -> dict[str, DataLoader]:
+     """Create DataLoaders for train and validation."""
+     dataset_dict = load_from_disk(cfg.data.path)
+
+     collator = CodeCompletionCollator(
+         tokenizer=tokenizer,
+         max_context_len=cfg.data.max_context_len,
+         max_target_len=cfg.data.max_target_len,
+     )
+
+     dataloaders = {}
+
+     if "train" in dataset_dict:
+         train_dataset = dataset_dict["train"]
+         max_train = cfg.data.get("max_train_samples", None)
+         if max_train is not None:
+             train_dataset = train_dataset.select(range(min(max_train, len(train_dataset))))
+         dataloaders["train"] = DataLoader(
+             train_dataset,
+             batch_size=cfg.training.batch_size,
+             shuffle=True,
+             collate_fn=collator,
+             num_workers=cfg.data.num_workers,
+             pin_memory=cfg.data.pin_memory,
+         )
+
+     if "validation" in dataset_dict:
+         val_dataset = dataset_dict["validation"]
+         max_val = cfg.data.get("max_val_samples", None)
+         if max_val is not None:
+             val_dataset = val_dataset.select(range(min(max_val, len(val_dataset))))
+         eval_batch_size = cfg.training.get("eval_batch_size", cfg.training.batch_size)
+         dataloaders["validation"] = DataLoader(
+             val_dataset,
+             batch_size=eval_batch_size,
+             shuffle=False,
+             collate_fn=collator,
+             num_workers=cfg.data.num_workers,
+             pin_memory=cfg.data.pin_memory,
+         )
+
+     return dataloaders
+
+
+ # ============================================================================
+ # LOSS FUNCTIONS
+ # ============================================================================
+
+
+ def compute_loss(
+     logits: torch.Tensor,
+     input_ids: torch.Tensor,
+     context_lengths: torch.Tensor,
+     attention_mask: torch.Tensor,
+ ) -> dict:
+     """Compute the loss for an autoregressive model (target tokens only)."""
+     batch_size, seq_len, vocab_size = logits.shape
+
+     shift_logits = logits[:, :-1, :].contiguous()
+     shift_labels = input_ids[:, 1:].contiguous()
+     shift_mask = attention_mask[:, 1:].contiguous()
+
+     target_mask = torch.zeros_like(shift_labels, dtype=torch.bool)
+     for i in range(batch_size):
+         ctx_len = context_lengths[i].item()
+         target_mask[i, ctx_len - 1 :] = True
+
+     final_mask = target_mask & shift_mask.bool()
+
+     if final_mask.sum() > 0:
+         loss = F.cross_entropy(
+             shift_logits[final_mask], shift_labels[final_mask], reduction="mean"
+         )
+     else:
+         loss = torch.tensor(0.0, device=logits.device)
+
+     return {"loss": loss}
+
+
+ def _pythia_forward_loss(
+     model: nn.Module,
+     batch: dict,
+     cfg: DictConfig,
+     accelerator: Accelerator,
+ ) -> dict:
+     """Forward + loss for a plain HF causal LM (attention_mask= kwarg, .logits)."""
+     input_ids = batch["input_ids"]
+     attention_mask = batch["attention_mask"]
+     context_lengths = batch["context_lengths"]
+     output = model(input_ids, attention_mask=attention_mask)
+     return compute_loss(output.logits, input_ids, context_lengths, attention_mask)
+
+
+ # ============================================================================
+ # PARAMETER GROUPING
+ # ============================================================================
+
+
+ def group_params(model: nn.Module, weight_decay: float) -> list[dict]:
+     """Group parameters for the optimizer (no weight decay on bias/LayerNorm)."""
+     decay_params = []
+     no_decay_params = []
+
+     for name, param in model.named_parameters():
+         if not param.requires_grad:
+             continue
+
+         if "bias" in name or "LayerNorm" in name or "layernorm" in name:
+             no_decay_params.append(param)
+         else:
+             decay_params.append(param)
+
+     return [
+         {"params": decay_params, "weight_decay": weight_decay},
+         {"params": no_decay_params, "weight_decay": 0.0},
+     ]
+
+
+ # ============================================================================
+ # TRAINING LOOP
+ # ============================================================================
+
+
+ def train_epoch(
+     model: nn.Module,
+     dataloader: DataLoader,
+     optimizer: torch.optim.Optimizer,
+     scheduler,
+     cfg: DictConfig,
+     epoch: int,
+     global_step: int,
+     accelerator: Accelerator,
+     val_dataloader: DataLoader | None = None,
+     best_val_loss: float = float("inf"),
+ ) -> tuple[int, float]:
+     """One training epoch. Returns (global_step, best_val_loss)."""
+     model.train()
+
+     loss_meter = AverageMeter()
+
+     optimizer.zero_grad()
+     accumulated_loss = 0.0
+     accumulated_steps = 0
+
+     epoch_start_time = time.time()
+     step_start_time = time.time()
+
+     for batch_idx, batch in enumerate(dataloader):
+         input_ids = batch["input_ids"]
+         attention_mask = batch["attention_mask"]
+         context_lengths = batch["context_lengths"]
+
+         with accelerator.autocast():
+             output = model(input_ids, attention_mask=attention_mask)
+             logits = output.logits
+             loss_dict = compute_loss(
+                 logits, input_ids, context_lengths, attention_mask
+             )
+
+         loss = loss_dict["loss"] / cfg.training.gradient_accumulation_steps
+         accelerator.backward(loss)
+
+         accumulated_loss += loss_dict["loss"].item()
+         accumulated_steps += 1
+
+         if accumulated_steps == cfg.training.gradient_accumulation_steps:
+             if cfg.training.max_grad_norm > 0:
+                 accelerator.clip_grad_norm_(
+                     model.parameters(), cfg.training.max_grad_norm
+                 )
+
+             optimizer.step()
+             scheduler.step()
+             optimizer.zero_grad()
+
+             avg_loss = accumulated_loss / cfg.training.gradient_accumulation_steps
+             loss_meter.update(avg_loss)
+
+             global_step += 1
+
+             if global_step % cfg.logging.log_interval == 0:
+                 step_time = time.time() - step_start_time
+                 current_lr = scheduler.get_last_lr()[0]
+
+                 metrics = {
+                     "train/loss": loss_meter.val,
+                     "train/loss_avg": loss_meter.avg,
+                     "train/lr": current_lr,
+                     "train/epoch": epoch,
+                     "train/step_time": step_time / cfg.logging.log_interval,
+                 }
+
+                 log_metrics(metrics, step=global_step)
+
+                 log_message(
+                     f"Epoch {epoch} | Step {global_step} | "
+                     f"Loss: {loss_meter.avg:.4f} | "
+                     f"LR: {current_lr:.2e}",
+                     cfg,
+                     accelerator,
+                 )
+
+                 step_start_time = time.time()
+
+             if (
+                 cfg.logging.save_interval > 0
+                 and global_step % cfg.logging.save_interval == 0
+             ):
+                 save_checkpoint(
+                     model, optimizer, scheduler, global_step, epoch, cfg, accelerator
+                 )
+
+             eval_interval = cfg.logging.get("eval_interval", 0)
+             if (
+                 eval_interval > 0
+                 and val_dataloader is not None
+                 and global_step % eval_interval == 0
+             ):
+                 val_metrics = run_validation(
+                     model=model,
+                     dataloader=val_dataloader,
+                     cfg=cfg,
+                     global_step=global_step,
+                     accelerator=accelerator,
+                     forward_loss_fn=_pythia_forward_loss,
+                 )
+
+                 if val_metrics["val/loss"] < best_val_loss:
+                     best_val_loss = val_metrics["val/loss"]
+                     if accelerator.is_main_process:
+                         best_model_path = Path(cfg.paths.output_dir) / "model_best.pt"
+                         unwrapped_model = accelerator.unwrap_model(model)
+                         torch.save(unwrapped_model.state_dict(), best_model_path)
+                         log_message(
+                             f"New best model saved! Val loss: {best_val_loss:.4f}",
+                             cfg,
+                             accelerator,
+                         )
+
+                     log_metrics(
+                         {
+                             "best/val_loss": best_val_loss,
+                             "best/val_perplexity": val_metrics["val/perplexity"],
+                             "best/step": global_step,
+                         },
+                         step=global_step,
+                     )
+
+                 model.train()
+
+             accumulated_loss = 0.0
+             accumulated_steps = 0
+
+     epoch_time = time.time() - epoch_start_time
+
+     log_message(
+         f"Epoch {epoch} completed in {epoch_time:.2f}s | "
+         f"Loss: {loss_meter.avg:.4f}",
+         cfg,
+         accelerator,
+     )
+
+     log_metrics({
+         "epoch/loss": loss_meter.avg,
+         "epoch/time": epoch_time,
+     })
+
+     return global_step, best_val_loss
+
+
+ # ============================================================================
+ # MAIN
+ # ============================================================================
+
+
+ @hydra.main(version_base=None, config_path="configs", config_name="config")
+ def main(cfg: DictConfig):
+     """Main training function with DDP support via Accelerate."""
+
+     # === Performance: Enable TF32 for faster matmuls on Ampere+ GPUs ===
+     torch.set_float32_matmul_precision('high')
+
+     # === Accelerator Setup ===
+     mixed_precision = "bf16" if cfg.training.use_amp else "no"
+
+     accelerator = Accelerator(
+         mixed_precision=mixed_precision,
+         gradient_accumulation_steps=cfg.training.gradient_accumulation_steps,
+     )
+
+     # === Setup ===
+     accelerate_set_seed(cfg.seed)
+
+     if cfg.paths.output_dir is None:
+         cfg.paths.output_dir = HydraConfig.get().runtime.output_dir
+
+     OmegaConf.resolve(cfg)
+
+     log_message(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}", cfg, accelerator)
+     log_message(f"Number of processes: {accelerator.num_processes}", cfg, accelerator)
+     log_message(f"Process index: {accelerator.process_index}", cfg, accelerator)
+     log_message(f"Mixed precision: {mixed_precision}", cfg, accelerator)
+
+     log_message("=" * 60, cfg, accelerator)
+     log_message("Pythia Training Pipeline (Hydra + Trackio + Accelerate)", cfg, accelerator)
+     log_message("=" * 60, cfg, accelerator)
+     log_message(f"Config:\n{OmegaConf.to_yaml(cfg)}", cfg, accelerator)
+
+     # === Trackio Init ===
+     init_tracking(cfg, accelerator)
+
+     # === Tokenizer ===
+     log_message("Initializing tokenizer...", cfg, accelerator)
+     tokenizer = AutoTokenizer.from_pretrained(cfg.model.name)
+
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+         tokenizer.pad_token_id = tokenizer.eos_token_id
+
+     # === Model ===
+     log_message("Loading model...", cfg, accelerator)
+
+     # Flash Attention 2: pick a dtype compatible with AMP
+     torch_dtype = torch.bfloat16 if cfg.training.use_amp else torch.float32
+
+     if cfg.model.checkpoint_path:
+         model = AutoModelForCausalLM.from_pretrained(
+             cfg.model.name,
+             attn_implementation="flash_attention_2",
+             torch_dtype=torch_dtype,
+         )
+         checkpoint = torch.load(cfg.model.checkpoint_path, map_location="cpu")
+         model.load_state_dict(checkpoint["model_state_dict"] if "model_state_dict" in checkpoint else checkpoint)
+         log_message(f"Loaded checkpoint: {cfg.model.checkpoint_path}", cfg, accelerator)
+     elif cfg.model.from_scratch:
+         config = AutoConfig.from_pretrained(cfg.model.name)
+         config._attn_implementation = "flash_attention_2"
+         model = AutoModelForCausalLM.from_config(config, torch_dtype=torch_dtype)
+         log_message(f"Initialized from scratch: {cfg.model.name}", cfg, accelerator)
+     else:
+         model = AutoModelForCausalLM.from_pretrained(
+             cfg.model.name,
+             attn_implementation="flash_attention_2",
+             torch_dtype=torch_dtype,
+         )
+         log_message(f"Loaded pretrained: {cfg.model.name}", cfg, accelerator)
+
+     model.train()
+
+     # Log model info
+     total_params = sum(p.numel() for p in model.parameters())
+     trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+     log_message(f"Total params: {total_params:,}", cfg, accelerator)
+     log_message(f"Trainable params: {trainable_params:,}", cfg, accelerator)
+
+     # === Data ===
+     log_message("Creating dataloaders...", cfg, accelerator)
+     dataloaders = create_dataloaders(cfg, tokenizer)
+
+     train_dataloader = dataloaders["train"]
+     val_dataloader = dataloaders.get("validation", None)
+
+     log_message(f"Train dataset size: {len(train_dataloader.dataset)}", cfg, accelerator)
+     log_message(f"Train batches per epoch (before DDP split): {len(train_dataloader)}", cfg, accelerator)
+
+     if val_dataloader:
+         log_message(f"Validation dataset size: {len(val_dataloader.dataset)}", cfg, accelerator)
+         log_message(f"Validation batches: {len(val_dataloader)}", cfg, accelerator)
+     else:
+         log_message("No validation dataset found", cfg, accelerator)
+
+     # === Optimizer ===
+     log_message("Creating optimizer...", cfg, accelerator)
+     param_groups = group_params(model, cfg.training.weight_decay)
+
+     optimizer = torch.optim.AdamW(
+         param_groups,
+         lr=cfg.training.lr,
+         betas=tuple(cfg.training.betas),
+         eps=cfg.training.eps,
+     )
+
+     # === Scheduler ===
+     steps_per_epoch = math.ceil(
+         len(train_dataloader) / accelerator.num_processes
+     )
+     total_steps = (
+         cfg.training.epochs
+         * steps_per_epoch
+         // cfg.training.gradient_accumulation_steps
+     )
+     scheduler = get_lr_scheduler(optimizer, cfg, total_steps)
+
+     log_message(
+         f"Total steps: {total_steps}, Steps per epoch: {steps_per_epoch}",
+         cfg,
+         accelerator,
+     )
+
+     # === Accelerate Prepare ===
+     log_message("Preparing model, optimizer, and dataloaders with Accelerate...", cfg, accelerator)
+
+     if val_dataloader is not None:
+         model, optimizer, train_dataloader, val_dataloader, scheduler = accelerator.prepare(
+             model, optimizer, train_dataloader, val_dataloader, scheduler
+         )
+     else:
+         model, optimizer, train_dataloader, scheduler = accelerator.prepare(
+             model, optimizer, train_dataloader, scheduler
+         )
+
+     log_message(f"Train batches per epoch (after DDP split): {len(train_dataloader)}", cfg, accelerator)
+
+     # === Resume ===
+     global_step = 0
+     start_epoch = 1
+
+     if cfg.training.resume and cfg.training.resume_checkpoint:
+         global_step, start_epoch = load_checkpoint(
+             model, optimizer, scheduler, cfg.training.resume_checkpoint, cfg, accelerator
+         )
+         start_epoch += 1
+
+     # === Training Loop ===
+     log_message("Starting training...", cfg, accelerator)
+
+     best_val_loss = float("inf")
+
+     try:
+         for epoch in range(start_epoch, cfg.training.epochs + 1):
+             log_message(f"\n{'=' * 60}", cfg, accelerator)
+             log_message(f"EPOCH {epoch}/{cfg.training.epochs}", cfg, accelerator)
+             log_message(f"{'=' * 60}", cfg, accelerator)
+
+             global_step, best_val_loss = train_epoch(
+                 model=model,
+                 dataloader=train_dataloader,
+                 optimizer=optimizer,
+                 scheduler=scheduler,
+                 cfg=cfg,
+                 epoch=epoch,
+                 global_step=global_step,
+                 accelerator=accelerator,
+                 val_dataloader=val_dataloader,
+                 best_val_loss=best_val_loss,
+             )
+
+             if cfg.logging.save_every_epoch:
+                 save_checkpoint(
+                     model, optimizer, scheduler, global_step, epoch, cfg, accelerator
+                 )
+
+     except KeyboardInterrupt:
+         log_message("Training interrupted by user", cfg, accelerator)
+         save_checkpoint(model, optimizer, scheduler, global_step, epoch, cfg, accelerator)
+
+     # === Final Save ===
+     log_message("\nTraining completed!", cfg, accelerator)
+
+     if accelerator.is_main_process:
+         final_model_path = Path(cfg.paths.output_dir) / "model_final.pt"
+         unwrapped_model = accelerator.unwrap_model(model)
+         torch.save(unwrapped_model.state_dict(), final_model_path)
+         log_message(f"Final model: {final_model_path}", cfg, accelerator)
+
+     accelerator.wait_for_everyone()
+     finish_tracking()
+
+
+ if __name__ == "__main__":
+     main()
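
A note on the artifact formats this script implies: `model_best.pt` / `model_final.pt` are bare `state_dict`s saved with `torch.save` (~2.8 GB), while `checkpoints/checkpoint_*.pt` go through `training_lib.checkpointing.save_checkpoint` and are roughly 3x larger, consistent with also carrying optimizer and scheduler state; that helper is not in this diff, so its exact layout is an assumption. A minimal sketch of reloading the final weights:

import torch
from transformers import AutoModelForCausalLM

# Rebuild the base architecture, then load the fine-tuned state_dict.
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-1.4b")
state = torch.load("model_final.pt", map_location="cpu")
model.load_state_dict(state)
model.eval()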
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/files/config.yaml ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ wandb_version: 1
+
+ _wandb:
+   desc: null
+   value:
+     code_path: code/code_completion_exp/train_pythia/train.py
+     python_version: 3.12.0
+     cli_version: 0.24.0
+     framework: huggingface
+     huggingface_version: 4.57.6
+     is_jupyter_run: false
+     is_kaggle_kernel: false
+     start_time: 1777154164
+     t:
+       1:
+       - 1
+       - 11
+       - 49
+       - 50
+       - 51
+       - 71
+       - 105
+       2:
+       - 1
+       - 11
+       - 49
+       - 50
+       - 51
+       - 71
+       - 105
+       3:
+       - 2
+       - 13
+       - 16
+       - 37
+       - 42
+       - 61
+       4: 3.12.0
+       5: 0.24.0
+       6: 4.57.6
+       13: linux-x86_64
+     e:
+       iazobdn073n3heo7pq8hjqzxa01dq9xi:
+         os: Linux-5.4.0-176-generic-x86_64-with-glibc2.35
+         python: CPython 3.12.0
+         started_at: '2026-04-25T21:56:04.223508Z'
+         program: /workspace/byte-llms-code/code_completion_exp/train_pythia/train.py
+         code_path: code_completion_exp/train_pythia/train.py
+         code_path_local: train.py
+         git:
+           remote_url: https://github.com/naryst/byte-llms-code.git
+           commit: 0a7180b6ab9f63d2794494f09ec4918576d10fa2
+         email: nikita@local.ru
+         root: outputs/pythia_1_4b_rerun/pythia_1_4b_lr_2e-5
+         host: 7504e518d24a
+         executable: /venv/bytellm/bin/python
+         cpu_count: 64
+         cpu_count_logical: 128
+         gpu_type: NVIDIA H100 80GB HBM3
+         gpu_count: 4
+         disk:
+           /:
+             total: '265214230528'
+             used: '111477301248'
+         memory:
+           total: '1081679683584'
+         gpu_nvidia:
+         - name: NVIDIA H100 80GB HBM3
+           memory_total: '85520809984'
+           cuda_cores: 16896
+           architecture: Hopper
+           uuid: GPU-b60cdcab-2033-2009-41de-be646c953a20
+         - name: NVIDIA H100 80GB HBM3
+           memory_total: '85520809984'
+           cuda_cores: 16896
+           architecture: Hopper
+           uuid: GPU-9982b420-4520-4238-c378-ec5a46015474
+         - name: NVIDIA H100 80GB HBM3
+           memory_total: '85520809984'
+           cuda_cores: 16896
+           architecture: Hopper
+           uuid: GPU-e26ebaac-aaa6-3eed-17ab-a3dce303a76f
+         - name: NVIDIA H100 80GB HBM3
+           memory_total: '85520809984'
+           cuda_cores: 16896
+           architecture: Hopper
+           uuid: GPU-9dfc6dba-0be6-4a10-1027-336cc0e65134
+         cuda_version: '12.2'
+     writer_id: iazobdn073n3heo7pq8hjqzxa01dq9xi
+ model:
+   desc: null
+   value:
+     name: EleutherAI/pythia-1.4b
+     checkpoint_path: null
+     from_scratch: false
+ training:
+   desc: null
+   value:
+     epochs: 3
+     batch_size: 4
+     eval_batch_size: 12
+     gradient_accumulation_steps: 4
+     lr: 2.0e-05
+     weight_decay: 0.1
+     betas:
+     - 0.9
+     - 0.95
+     eps: 1.0e-08
+     lr_scheduler: wsd
+     warmup_ratio: 0.1
+     decay_ratio: 0.2
+     warmup_steps: 100
+     min_lr_ratio: 0.1
+     max_grad_norm: 1.0
+     use_amp: true
+     resume: false
+     resume_checkpoint: null
+ data:
+   desc: null
+   value:
+     path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
+     max_context_len: 4096
+     max_target_len: 256
+     num_workers: 4
+     pin_memory: true
+     max_train_samples: null
+     max_val_samples: null
+ logging:
+   desc: null
+   value:
+     log_interval: 10
+     save_interval: 6000
+     eval_interval: 1000
+     save_every_epoch: true
+ tracking:
+   desc: null
+   value:
+     enabled: true
+     backend: wandb
+     project: code-completion_pythia-1.4b-rerun
+     run_name: pythia_1_4b_lr_2e-5
+     entity: null
+     base_url: https://wandb.platun0v.ru
+     local_dir: outputs/pythia_1_4b_rerun/pythia_1_4b_lr_2e-5
+ paths:
+   desc: null
+   value:
+     output_dir: outputs/pythia_1_4b_rerun/pythia_1_4b_lr_2e-5
+ seed:
+   desc: null
+   value: 42
+ device:
+   desc: null
+   value: cuda
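Note: the training block selects lr_scheduler: wsd (warmup-stable-decay) with warmup_steps: 100, decay_ratio: 0.2 and min_lr_ratio: 0.1, but the scheduler implementation itself is not part of this upload. A minimal sketch of a WSD multiplier consistent with those fields; the function name and the total_steps value of 29661 (taken from the last checkpoint) are assumptions:

import torch

def wsd_lambda(step, total_steps, warmup_steps=100, decay_ratio=0.2, min_lr_ratio=0.1):
    # Returns a multiplier on the base lr (2.0e-05 in this run).
    decay_start = int(total_steps * (1.0 - decay_ratio))
    if step < warmup_steps:        # linear warmup
        return step / max(1, warmup_steps)
    if step < decay_start:         # stable plateau at the base lr
        return 1.0
    progress = (step - decay_start) / max(1, total_steps - decay_start)
    return 1.0 - progress * (1.0 - min_lr_ratio)  # linear decay to min_lr_ratio

model = torch.nn.Linear(8, 8)  # stand-in; the run uses pythia-1.4b
optimizer = torch.optim.AdamW(model.parameters(), lr=2.0e-05,
                              betas=(0.9, 0.95), weight_decay=0.1, eps=1e-08)
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer, lr_lambda=lambda s: wsd_lambda(s, total_steps=29661))

The decay endpoint, min_lr_ratio * lr = 2.0e-06, matches the final train/lr recorded in wandb-summary.json below.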
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/files/requirements.txt ADDED
@@ -0,0 +1,245 @@
+ setuptools==78.1.1
+ wheel==0.45.1
+ pip==25.2
+ webencodings==0.5.1
+ triton==3.2.0
+ pytz==2025.2
+ pydub==0.25.1
+ pure_eval==0.2.3
+ ptyprocess==0.7.0
+ nvidia-ml-py==13.590.48
+ nvidia-cusparselt-cu12==0.6.2
+ mpmath==1.3.0
+ ipython-genutils==0.2.0
+ fastjsonschema==2.21.2
+ brotli==1.2.0
+ antlr4-python3-runtime==4.9.3
+ xxhash==3.6.0
+ widgetsnbextension==4.0.14
+ websocket-client==1.9.0
+ webcolors==24.11.1
+ wcwidth==0.2.14
+ urllib3==2.5.0
+ uri-template==1.3.0
+ tzdata==2025.2
+ typing_extensions==4.15.0
+ types-python-dateutil==2.9.0.20251008
+ traitlets==5.14.3
+ tqdm==4.67.1
+ tornado==6.5.2
+ tomlkit==0.13.3
+ tinycss2==1.4.0
+ tabulate==0.9.0
+ sympy==1.13.1
+ soupsieve==2.8
+ sniffio==1.3.1
+ smmap==5.0.2
+ six==1.17.0
+ shellingham==1.5.4
+ Send2Trash==1.8.3
+ semantic-version==2.10.0
+ safetensors==0.6.2
+ rpds-py==0.27.1
+ rfc3986-validator==0.1.1
+ regex==2025.9.18
+ pyzmq==27.1.0
+ PyYAML==6.0.3
+ python-multipart==0.0.22
+ python-json-logger==4.0.0
+ python-dotenv==1.2.1
+ pyparsing==3.2.5
+ PyJWT==2.8.0
+ Pygments==2.19.2
+ pycparser==2.23
+ pyarrow==22.0.0
+ psutil==7.1.0
+ protobuf==6.33.4
+ propcache==0.4.1
+ prometheus_client==0.23.1
+ portalocker==3.2.0
+ platformdirs==4.5.0
+ pillow==11.3.0
+ pexpect==4.9.0
+ pathspec==1.0.4
+ parso==0.8.5
+ pandocfilters==1.5.1
+ packaging==25.0
+ orjson==3.11.6
+ opt_einsum==3.4.0
+ nvidia-nvtx-cu12==12.4.127
+ nvidia-nvjitlink-cu12==12.4.127
+ nvidia-nccl-cu12==2.21.5
+ nvidia-curand-cu12==10.3.5.147
+ nvidia-cufile-cu12==1.13.1.3
+ nvidia-cufft-cu12==11.2.1.3
+ nvidia-cuda-runtime-cu12==12.4.127
+ nvidia-cuda-nvrtc-cu12==12.4.127
+ nvidia-cuda-cupti-cu12==12.4.127
+ nvidia-cublas-cu12==12.4.5.8
+ numpy==2.3.3
+ ninja==1.13.0
+ networkx==3.5
+ nest-asyncio==1.6.0
+ narwhals==2.15.0
+ mypy_extensions==1.1.0
+ multidict==6.7.0
+ mistune==3.1.4
+ mdurl==0.1.2
+ MarkupSafe==3.0.3
+ lxml==6.0.2
+ librt==0.8.0
+ lark==1.3.0
+ kiwisolver==1.4.9
+ jupyterlab_widgets==3.0.15
+ jupyterlab_pygments==0.3.0
+ jsonpointer==3.0.0
+ json5==0.12.1
+ itsdangerous==2.2.0
+ idna==3.10
+ hf-xet==1.1.10
+ h11==0.16.0
+ groovy==0.1.2
+ fsspec==2025.9.0
+ frozenlist==1.8.0
+ fqdn==1.5.1
+ fonttools==4.60.1
+ filelock==3.19.1
+ ffmpy==1.0.0
+ executing==2.2.1
+ einops==0.8.1
+ dill==0.4.0
+ defusedxml==0.7.1
+ decorator==5.2.1
+ debugpy==1.8.17
+ dacite==1.9.2
+ cycler==0.12.1
+ comm==0.2.3
+ colorama==0.4.6
+ click==8.3.1
+ charset-normalizer==3.4.3
+ certifi==2025.10.5
+ bleach==6.2.0
+ babel==2.17.0
+ attrs==25.4.0
+ async-lru==2.0.5
+ asttokens==3.0.0
+ annotated-types==0.7.0
+ annotated-doc==0.0.4
+ aiohappyeyeballs==2.6.1
+ aiofiles==24.1.0
+ yarl==1.22.0
+ uvicorn==0.40.0
+ typing-inspection==0.4.2
+ terminado==0.18.1
+ stack-data==0.6.3
+ sentry-sdk==2.50.0
+ scipy==1.17.0
+ sacrebleu==2.6.0
+ rfc3987-syntax==1.1.0
+ rfc3339-validator==0.1.4
+ requests==2.32.5
+ reportlab==4.4.9
+ referencing==0.36.2
+ python-dateutil==2.9.0.post0
+ pydantic_core==2.41.5
+ prompt_toolkit==3.0.52
+ plotly==6.5.2
+ pathlib2==2.3.7.post1
+ orderedmultidict==1.0.2
+ optree==0.17.0
+ omegaconf==2.3.0
+ nvidia-cusparse-cu12==12.3.1.170
+ nvidia-cudnn-cu12==9.1.0.70
+ mypy==1.19.1
+ multiprocess==0.70.16
+ matplotlib-inline==0.1.7
+ markdown-it-py==4.0.0
+ jupyter_core==5.8.1
+ Jinja2==3.1.6
+ jedi==0.19.2
+ ipython_pygments_lexers==1.1.1
+ httpcore==1.0.9
+ gitdb==4.0.12
+ ftfy==6.3.1
+ contourpy==1.3.3
+ cffi==2.0.0
+ beautifulsoup4==4.14.2
+ anyio==4.11.0
+ aiosignal==1.4.0
+ starlette==0.50.0
+ rich==14.2.0
+ pydantic==2.12.5
+ pandas==2.3.3
+ nvidia-cusolver-cu12==11.6.1.9
+ matplotlib==3.10.7
+ jupyter_server_terminals==0.5.3
+ jupyter_client==8.6.3
+ jsonschema-specifications==2025.9.1
+ ipython==9.6.0
+ hydra-core==1.3.2
+ huggingface-hub==0.35.3
+ httpx==0.28.1
+ GitPython==3.1.46
+ furl==2.1.4
+ cryptography==46.0.4
+ arrow==1.3.0
+ argon2-cffi-bindings==25.1.0
+ aiohttp==3.13.1
+ wandb==0.24.0
+ typer==0.21.1
+ torch==2.6.0
+ tokenizers==0.22.1
+ seaborn==0.13.2
+ safehttpx==0.1.7
+ jsonschema==4.25.1
+ joypy==0.2.6
+ isoduration==20.11.0
+ ipywidgets==8.1.7
+ ipykernel==6.30.1
+ gradio_client==2.0.3
+ fastapi==0.128.0
+ Authlib==1.6.6
+ argon2-cffi==25.1.0
+ transformers==4.57.6
+ nbformat==5.10.4
+ mlstm_kernels==2.0.2
+ jupyter-console==6.6.3
+ gradio==6.5.1
+ datasets==4.3.0
+ clearml==1.16.4
+ accelerate==1.10.1
+ xlstm==2.0.4
+ nbclient==0.10.2
+ jupyter-events==0.12.0
+ trackio==0.15.0
+ nbconvert==7.16.6
+ jupyter_server==2.17.0
+ notebook_shim==0.2.4
+ jupyterlab_server==2.27.3
+ jupyter-lsp==2.3.0
+ nbclassic==1.3.3
+ jupyterlab==4.4.9
+ notebook==7.4.7
+ jupyter_contrib_core==0.4.2
+ jupyter==1.1.1
+ jupyter_nbextensions_configurator==0.6.4
+ causal-conv1d==1.5.0.post8
+ flash_attn==2.7.4.post1
+ mamba-ssm==2.2.4
+ hnet==0.0.1
+ autocommand==2.2.2
+ backports.tarfile==1.2.0
+ importlib_metadata==8.0.0
+ inflect==7.3.1
+ jaraco.collections==5.1.0
+ jaraco.context==5.3.0
+ jaraco.functools==4.0.1
+ jaraco.text==3.12.1
+ more-itertools==10.3.0
+ packaging==24.2
+ platformdirs==4.2.2
+ tomli==2.0.1
+ typeguard==4.3.0
+ typing_extensions==4.12.2
+ wheel==0.45.1
+ zipp==3.19.2
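Note: this is a full pin of the environment; the packages most relevant to the run are torch==2.6.0, transformers==4.57.6, accelerate==1.10.1, flash_attn==2.7.4.post1 and wandb==0.24.0. A small sketch for checking a reproduced environment against those pins (the selection of packages is an assumption about what matters here):

import importlib.metadata as md

for pkg in ("torch", "transformers", "accelerate", "flash_attn", "wandb", "datasets"):
    print(f"{pkg}=={md.version(pkg)}")  # compare the output against the pins above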
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/files/wandb-metadata.json ADDED
@@ -0,0 +1 @@
+ {"os": "Linux-5.4.0-176-generic-x86_64-with-glibc2.35", "python": "CPython 3.12.0", "started_at": "2026-04-25T21:56:04.223508Z", "program": "/workspace/byte-llms-code/code_completion_exp/train_pythia/train.py", "code_path": "code_completion_exp/train_pythia/train.py", "code_path_local": "train.py", "git": {"remote_url": "https://github.com/naryst/byte-llms-code.git", "commit": "0a7180b6ab9f63d2794494f09ec4918576d10fa2"}, "email": "nikita@local.ru", "root": "outputs/pythia_1_4b_rerun/pythia_1_4b_lr_2e-5", "host": "7504e518d24a", "executable": "/venv/bytellm/bin/python", "cpu_count": 64, "cpu_count_logical": 128, "gpu_type": "NVIDIA H100 80GB HBM3", "gpu_count": 4, "disk": {"/": {"total": "265214230528", "used": "111477301248"}}, "memory": {"total": "1081679683584"}, "gpu_nvidia": [{"name": "NVIDIA H100 80GB HBM3", "memory_total": "85520809984", "cuda_cores": 16896, "architecture": "Hopper", "uuid": "GPU-b60cdcab-2033-2009-41de-be646c953a20"}, {"name": "NVIDIA H100 80GB HBM3", "memory_total": "85520809984", "cuda_cores": 16896, "architecture": "Hopper", "uuid": "GPU-9982b420-4520-4238-c378-ec5a46015474"}, {"name": "NVIDIA H100 80GB HBM3", "memory_total": "85520809984", "cuda_cores": 16896, "architecture": "Hopper", "uuid": "GPU-e26ebaac-aaa6-3eed-17ab-a3dce303a76f"}, {"name": "NVIDIA H100 80GB HBM3", "memory_total": "85520809984", "cuda_cores": 16896, "architecture": "Hopper", "uuid": "GPU-9dfc6dba-0be6-4a10-1027-336cc0e65134"}], "cuda_version": "12.2", "writer_id": "iazobdn073n3heo7pq8hjqzxa01dq9xi"}
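Note: the metadata records gpu_count: 4 (H100 80GB). Together with batch_size: 4 and gradient_accumulation_steps: 4 from the config above, the effective batch under the usual accelerate convention works out to 4 × 4 × 4 = 64 sequences per optimizer step; whether train.py follows that convention exactly is not visible from the fragment shown earlier:

# assumed convention, not confirmed by the partial train.py in this diff
per_device_batch = 4   # training.batch_size
grad_accum = 4         # training.gradient_accumulation_steps
num_gpus = 4           # gpu_count from wandb-metadata.json
print(per_device_batch * grad_accum * num_gpus)  # 64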
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_runtime": 13701, "_timestamp": 1777167844.718428, "_step": 29660, "train/loss": 0.6748308092355728, "train/loss_avg": 0.7016983942730095, "train/lr": 2.0000000000000003e-06, "train/epoch": 3, "train/step_time": 0.3627600908279419, "val/loss": 0.9848005849860421, "val/time": 92.58129596710205, "best/val_perplexity": 2.6676781909665626, "best/step": 19000, "val/perplexity": 2.712564749615551, "best/val_loss": 0.9681412068259936, "epoch/loss": 0.7017153006575418, "epoch/time": 4552.06911277771}
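Note: wandb-summary.json keeps only the last logged value per key; the best checkpoint by validation loss landed at step 19000. A sketch for reading the headline numbers back out of this upload:

import json
from pathlib import Path

run_dir = Path("pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw")
summary = json.loads((run_dir / "files" / "wandb-summary.json").read_text())
print(f"best val loss {summary['best/val_loss']:.4f} "
      f"(perplexity {summary['best/val_perplexity']:.3f}) at step {summary['best/step']}")
# -> best val loss 0.9681 (perplexity 2.668) at step 19000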
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
+ {"time":"2026-04-25T21:56:04.307874507Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmptd7__oni/port-185429.txt","pid":185429,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
+ {"time":"2026-04-25T21:56:04.308305608Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":185429}
+ {"time":"2026-04-25T21:56:04.30830261Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-185429-185491-3756380617/socket","Net":"unix"}}
+ {"time":"2026-04-25T21:56:04.495821383Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
+ {"time":"2026-04-25T21:56:04.515259356Z","level":"INFO","msg":"handleInformInit: received","streamId":"jhqe4qjw","id":"1(@)"}
+ {"time":"2026-04-25T21:56:04.914894772Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"jhqe4qjw","id":"1(@)"}
+ {"time":"2026-04-26T01:44:27.124695645Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"jhqe4qjw","id":"1(@)"}
+ {"time":"2026-04-26T01:44:27.125165614Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"jhqe4qjw","id":"1(@)"}
+ {"time":"2026-04-26T01:44:27.144975551Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
+ {"time":"2026-04-26T01:44:27.145027858Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
+ {"time":"2026-04-26T01:44:27.145044227Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
+ {"time":"2026-04-26T01:44:27.145084129Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
+ {"time":"2026-04-26T01:44:27.145088296Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
+ {"time":"2026-04-26T01:44:27.145089567Z","level":"INFO","msg":"server is shutting down"}
+ {"time":"2026-04-26T01:44:27.145199385Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-185429-185491-3756380617/socket","Net":"unix"}}
+ {"time":"2026-04-26T01:44:27.145231896Z","level":"INFO","msg":"server is closed"}
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/logs/debug-internal.log ADDED
@@ -0,0 +1,15 @@
+ {"time":"2026-04-25T21:56:04.515355828Z","level":"INFO","msg":"stream: starting","core version":"0.24.0"}
+ {"time":"2026-04-25T21:56:04.914734923Z","level":"INFO","msg":"stream: created new stream","id":"jhqe4qjw"}
+ {"time":"2026-04-25T21:56:04.91479529Z","level":"INFO","msg":"handler: started","stream_id":"jhqe4qjw"}
+ {"time":"2026-04-25T21:56:04.914888584Z","level":"INFO","msg":"stream: started","id":"jhqe4qjw"}
+ {"time":"2026-04-25T21:56:04.914893063Z","level":"INFO","msg":"writer: started","stream_id":"jhqe4qjw"}
+ {"time":"2026-04-25T21:56:04.914903421Z","level":"INFO","msg":"sender: started","stream_id":"jhqe4qjw"}
+ {"time":"2026-04-25T21:56:05.060789967Z","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
+ {"time":"2026-04-25T23:20:15.280635585Z","level":"ERROR","msg":"api: HTTP error","status":403,"method":"POST","url":"https://wandb.platun0v.ru/files/nikita/code-completion_pythia-1.4b-rerun/jhqe4qjw/file_stream"}
+ {"time":"2026-04-25T23:20:15.280696408Z","level":"ERROR+4","msg":"filestream: fatal error: filestream: failed to upload: 403 Forbidden url=https://wandb.platun0v.ru/files/nikita/code-completion_pythia-1.4b-rerun/jhqe4qjw/file_stream: "}
+ {"time":"2026-04-26T01:44:27.121098166Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+ {"time":"2026-04-26T01:44:27.121923843Z","level":"INFO","msg":"handler: operation stats","stats":{}}
+ {"time":"2026-04-26T01:44:27.124737334Z","level":"INFO","msg":"stream: closing","id":"jhqe4qjw"}
+ {"time":"2026-04-26T01:44:27.124749132Z","level":"INFO","msg":"handler: closed","stream_id":"jhqe4qjw"}
+ {"time":"2026-04-26T01:44:27.124842763Z","level":"INFO","msg":"sender: closed","stream_id":"jhqe4qjw"}
+ {"time":"2026-04-26T01:44:27.124848143Z","level":"INFO","msg":"stream: closed","id":"jhqe4qjw"}
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/logs/debug.log ADDED
@@ -0,0 +1,24 @@
+ 2026-04-25 21:56:04,224 INFO MainThread:185429 [wandb_setup.py:_flush():81] Current SDK version is 0.24.0
+ 2026-04-25 21:56:04,224 INFO MainThread:185429 [wandb_setup.py:_flush():81] Configure stats pid to 185429
+ 2026-04-25 21:56:04,224 INFO MainThread:185429 [wandb_setup.py:_flush():81] Loading settings from environment variables
+ 2026-04-25 21:56:04,225 INFO MainThread:185429 [wandb_init.py:setup_run_log_directory():717] Logging user logs to outputs/pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/logs/debug.log
+ 2026-04-25 21:56:04,225 INFO MainThread:185429 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to outputs/pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/logs/debug-internal.log
+ 2026-04-25 21:56:04,225 INFO MainThread:185429 [wandb_init.py:init():844] calling init triggers
+ 2026-04-25 21:56:04,225 INFO MainThread:185429 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
+ config: {'model': {'name': 'EleutherAI/pythia-1.4b', 'checkpoint_path': None, 'from_scratch': False}, 'training': {'epochs': 3, 'batch_size': 4, 'eval_batch_size': 12, 'gradient_accumulation_steps': 4, 'lr': 2e-05, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'eps': 1e-08, 'lr_scheduler': 'wsd', 'warmup_ratio': 0.1, 'decay_ratio': 0.2, 'warmup_steps': 100, 'min_lr_ratio': 0.1, 'max_grad_norm': 1.0, 'use_amp': True, 'resume': False, 'resume_checkpoint': None}, 'data': {'path': '/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full', 'max_context_len': 4096, 'max_target_len': 256, 'num_workers': 4, 'pin_memory': True, 'max_train_samples': None, 'max_val_samples': None}, 'logging': {'log_interval': 10, 'save_interval': 6000, 'eval_interval': 1000, 'save_every_epoch': True}, 'tracking': {'enabled': True, 'backend': 'wandb', 'project': 'code-completion_pythia-1.4b-rerun', 'run_name': 'pythia_1_4b_lr_2e-5', 'entity': None, 'base_url': 'https://wandb.platun0v.ru', 'local_dir': 'outputs/pythia_1_4b_rerun/pythia_1_4b_lr_2e-5'}, 'paths': {'output_dir': 'outputs/pythia_1_4b_rerun/pythia_1_4b_lr_2e-5'}, 'seed': 42, 'device': 'cuda', '_wandb': {'code_path': 'code/code_completion_exp/train_pythia/train.py'}}
+ 2026-04-25 21:56:04,225 INFO MainThread:185429 [wandb_init.py:init():892] starting backend
+ 2026-04-25 21:56:04,496 INFO MainThread:185429 [wandb_init.py:init():895] sending inform_init request
+ 2026-04-25 21:56:04,514 INFO MainThread:185429 [wandb_init.py:init():903] backend started and connected
+ 2026-04-25 21:56:04,517 INFO MainThread:185429 [wandb_init.py:init():973] updated telemetry
+ 2026-04-25 21:56:04,531 INFO MainThread:185429 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
+ 2026-04-25 21:56:05,060 INFO MainThread:185429 [wandb_init.py:init():1044] starting run threads in backend
+ 2026-04-25 21:56:05,217 INFO MainThread:185429 [wandb_run.py:_console_start():2529] atexit reg
+ 2026-04-25 21:56:05,217 INFO MainThread:185429 [wandb_run.py:_redirect():2377] redirect: wrap_raw
+ 2026-04-25 21:56:05,217 INFO MainThread:185429 [wandb_run.py:_redirect():2446] Wrapping output streams.
+ 2026-04-25 21:56:05,217 INFO MainThread:185429 [wandb_run.py:_redirect():2469] Redirects installed.
+ 2026-04-25 21:56:05,220 INFO MainThread:185429 [wandb_init.py:init():1084] run started, returning control to user process
+ 2026-04-26 01:44:26,160 INFO MainThread:185429 [wandb_run.py:_finish():2295] finishing run nikita/code-completion_pythia-1.4b-rerun/jhqe4qjw
+ 2026-04-26 01:44:26,160 INFO MainThread:185429 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0
+ 2026-04-26 01:44:26,161 INFO MainThread:185429 [wandb_run.py:_restore():2476] restore
+ 2026-04-26 01:44:26,161 INFO MainThread:185429 [wandb_run.py:_restore():2482] restore done
+ 2026-04-26 01:44:27,124 INFO MainThread:185429 [wandb_run.py:_footer_sync_info():3870] logging synced files
pythia_1_4b_rerun/pythia_1_4b_lr_2e-5/wandb/run-20260425_215604-jhqe4qjw/run-jhqe4qjw.wandb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:547dc16fb780cece720bb77317513532a4d9564e9b81b6d1445bbd19e2123eff
+ size 5773984